
Clustering methods compute a similarity (or closeness) measure, such as Euclidean distance, in order to decide whether two documents are ‘similar’ or not.
Using the entire class corpus, perform sentiment analysis on the positive and negative reviews.
Topic modeling is another way to group ‘similar’ documents into ‘clusters’.
# Standard library
import os
import random
import re
import string
from dataclasses import dataclass
from typing import Callable, Dict, List

# Scientific stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy
import seaborn as sns

# NLP libraries
import gensim
import nltk
from gensim import corpora, similarities
from gensim.models import Word2Vec, LdaMulticore, TfidfModel, CoherenceModel
from gensim.models import LsiModel, LdaModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# scikit-learn
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation  # added: used by lsa() and lda()
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE, MDS
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # added: used by One_Hot()
from sklearn.svm import SVC

# Notebook helpers
from IPython.display import display, HTML
#from google.colab import drive
#drive.mount('/content/gdrive')
# Only run this once, they will be downloaded.
# Fetch the NLTK data packages used below (stop-word list, WordNet data,
# the punkt tokenizer models, and the Open Multilingual WordNet).
# Safe to re-run: nltk.download() skips resources that are already present.
for resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(resource, quiet=True)
True
print("Genism Version: ", gensim.__version__)
Gensim Version: 4.3.2
def warn(*args, **kwargs):
    """No-op stand-in for warnings.warn; accepts and ignores any arguments."""
    return None

import warnings

# HACK: globally silence all warnings by monkey-patching warnings.warn with
# the no-op above.  Every library call to warnings.warn now does nothing.
warnings.warn = warn
# COMMENTED OUT FUNCTIONS HERE NOT REFERENCED BY OTHER PARTS OF NOTEBOOK
def add_movie_descriptor(data: pd.DataFrame, corpus_df: pd.DataFrame):
    """
    Adds a 'Descriptor' column to `data`, built from `corpus_df` in the form
    {Genre}_{Movie Title}_{P|N}_{DocID}, where P/N marks a Positive/Negative
    review (e.g. "Action_Angel_has_fallen_N_90").
    """
    # 'P' for rows whose review type is exactly 'Positive'; everything else is 'N'.
    polarity = corpus_df['Review Type (pos or neg)'].map(
        lambda value: 'P' if value == 'Positive' else 'N')
    data['Descriptor'] = (
        corpus_df['Genre of Movie']
        + '_' + corpus_df['Movie Title']
        + '_' + polarity
        + '_' + corpus_df['Doc_ID'].astype(str)
    )
def get_corpus_df(path):
    """Load the class-corpus CSV and return it sorted by Descriptor and
    indexed by Doc_ID.

    The Doc_ID index is also duplicated back into a regular column so the
    id stays addressable by name after set_index.
    """
    frame = pd.read_csv(path, encoding="utf-8")
    # Build the {Genre}_{Title}_{P|N}_{id} descriptor column in place.
    add_movie_descriptor(frame, frame)
    frame = frame.sort_values(['Descriptor']).set_index(['Doc_ID'])
    frame['Doc_ID'] = frame.index
    return frame
@dataclass
class Document:
    """A raw corpus document: its id and un-tokenized review text."""
    # NOTE(review): annotated str, but callers pass the integer Doc_ID
    # from the corpus DataFrame — confirm intended type.
    doc_id: str
    text: str
# def normalize_document(document: Document) -> Document:
# text = document.text
# text = remove_punctuation(text)
# text = lower_case(text)
# text = remove_tags(text)
# text = remove_special_chars_and_digits(text)
# return Document(document.doc_id, text)
# def normalize_documents(documents: List[Document]) -> List[Document]:
# """
# Normalizes text for all given documents.
# Removes punctuation, converts to lower case, removes tags and special characters.
# """
# return [normalize_document(x) for x in documents]
@dataclass
class TokenizedDocument:
    """A document after tokenization: its id and the list of token strings."""
    doc_id: str
    tokens: List[str]
# def tokenize_document(document: Document) -> TokenizedDocument:
# tokens = nltk.word_tokenize(document.text)
# return TokenizedDocument(document.doc_id, tokens)
# def tokenize_documents(documents: List[Document]) -> List[TokenizedDocument]:
# return [tokenize_document(x) for x in documents]
# def lemmatize(documents: List[TokenizedDocument]) -> List[TokenizedDocument]:
# result = []
# lemmatizer = WordNetLemmatizer()
# for document in documents:
# output_tokens = [lemmatizer.lemmatize(w) for w in document.tokens]
# result.append(TokenizedDocument(document.doc_id, output_tokens))
#
# return result
# def stem(documents: List[TokenizedDocument]) -> List[TokenizedDocument]:
# result = []
# stemmer = PorterStemmer()
# for document in documents:
# output_tokens = [stemmer.stem(w) for w in document.tokens]
# result.append(TokenizedDocument(document.doc_id, output_tokens))
#
# return result
# def remove_stop_words(documents: List[TokenizedDocument]) -> List[TokenizedDocument]:
# result = []
#
# stop_words = set(nltk.corpus.stopwords.words('english'))
# for document in documents:
# filtered_tokens = [w for w in document.tokens if not w in stop_words]
# result.append(TokenizedDocument(document.doc_id, filtered_tokens))
#
# return result
# def add_flags(data: pd.DataFrame, casino_royale_doc_ids: List[int], action_doc_ids: List[int]):
# data['is_casino_royale'] = data.index.isin(casino_royale_doc_ids)
# data['is_action'] = data.index.isin(action_doc_ids)
# def get_all_tokens(documents: List[TokenizedDocument]) -> List[str]:
# tokens = {y for x in documents for y in x.tokens}
# return sorted(list(tokens))
def clean_doc(doc):
    """Tokenize and normalize one raw review string.

    Pipeline: whitespace-split -> strip punctuation from each token ->
    keep purely alphabetic tokens longer than 4 characters -> lowercase ->
    drop English stop words plus a few domain words ('movie', 'film', ...).

    Returns the list of surviving tokens.
    """
    punct_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = [punct_pattern.sub('', token) for token in doc.split()]
    # Alphabetic-only and length > 4, then lowercase the survivors.
    normalized = [token.lower() for token in stripped
                  if token.isalpha() and len(token) > 4]
    # English stop words plus custom movie-review noise words.
    blocked = set(stopwords.words('english'))
    blocked.update(['movies', 'movie', 'film', 'films', 'scene'])
    return [token for token in normalized if token not in blocked]
# def final_processed_text_disabled(doc):
# #this is a function to join the processed text back
# ' '.join(doc)
# return doc
def tfidf(corpus, titles, ngram_range = (1,1)):
    """Build a TF-IDF document-term DataFrame.

    corpus: iterable of documents as whitespace-joined strings (the
        vectorizer requires the stitched-back-together text).
    titles: row labels, aligned with `corpus`.
    ngram_range: (min_n, max_n) n-gram range for the vectorizer.

    Returns a DataFrame: one row per document, one column per feature.

    BUG FIX: the ngram_range parameter was previously ignored — the
    vectorizer was hard-coded to (1, 1).  Also removed a duplicate
    get_feature_names_out() call whose result was unused.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    # get_feature_names_out requires sklearn >= 1.0
    # (get_feature_names was removed; see sklearn changelog).
    matrix = pd.DataFrame(tfidf_matrix.toarray(),
                          columns=vectorizer.get_feature_names_out(),
                          index=titles)
    return matrix
def One_Hot(variable):
    """One-hot encode a sequence of class labels.

    variable: iterable of class labels (strings or ints).
    Returns a dense (n_samples, n_classes) numpy array.

    BUG FIX: LabelEncoder and OneHotEncoder were never imported anywhere
    in this file, so calling this function raised NameError; the import
    block now pulls them in from sklearn.preprocessing.
    """
    label_encoder = LabelEncoder()
    # fit + transform in one call (equivalent to the original fit/transform pair)
    integer_labels = label_encoder.fit_transform(variable)
    encoder = OneHotEncoder()
    labels = encoder.fit_transform(integer_labels.reshape(-1, 1)).toarray()
    return labels
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """Train an LSA (LSI) model over tokenized documents.

    doc_clean: list of token lists, one per document.
    number_of_topics: number of latent concepts to fit.
    words: how many top terms per concept to print.

    Returns (model, dictionary, similarity index).
    Side effect: prints the top terms of every concept.
    """
    # Term dictionary: every unique token is assigned an integer id.
    vocab = corpora.Dictionary(doc_clean)
    # Express each document as a bag-of-words over that vocabulary.
    bow_corpus = [vocab.doc2bow(tokens) for tokens in doc_clean]
    model = LsiModel(bow_corpus,
                     num_topics=number_of_topics,
                     id2word=vocab,
                     power_iters=100)
    print(model.print_topics(num_topics=number_of_topics, num_words=words))
    # Dense document-to-document similarity index in LSI space.
    sim_index = similarities.MatrixSimilarity(model[bow_corpus])
    return model, vocab, sim_index
def lsa(tfidf_matrix, terms, n_components = 10):
    """Run truncated-SVD LSA on a TF-IDF matrix and print the 7 strongest
    terms of each component.

    tfidf_matrix: document-term matrix (dense or sparse).
    terms: feature names aligned with the matrix columns.
    n_components: number of latent components to extract.

    BUG FIX: n_components was previously ignored (TruncatedSVD was
    hard-coded to 10 components).  TruncatedSVD was also never imported;
    the import block now pulls it in from sklearn.decomposition.
    """
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(tfidf_matrix)
    for topic_idx, component in enumerate(svd.components_):
        # Pair each term with its weight and keep the 7 strongest.
        ranked = sorted(zip(terms, component), key=lambda tw: tw[1], reverse=True)[:7]
        print("Topic "+str(topic_idx)+": ")
        for term, _weight in ranked:
            print(term)
def create_gensim_lda_model(doc_clean,number_of_topics,words):
    """Train a gensim LdaModel over tokenized documents.

    doc_clean: list of token lists, one per document.
    number_of_topics: number of LDA topics to fit.
    words: how many top terms per topic to print.

    Returns (ldamodel, dictionary, similarity index, doc-term matrix).
    Side effect: prints the top terms of every topic.
    """
    # Term dictionary of the corpus: every unique term is assigned an integer id.
    dictionary = corpora.Dictionary(doc_clean)
    # Convert each document into a bag-of-words over that dictionary.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # Fit LDA: alpha/eta are learned from the data ('auto'); the fixed
    # random_state makes runs reproducible.
    ldamodel = LdaModel(doc_term_matrix
                        ,num_topics=number_of_topics
                        ,id2word = dictionary
                        ,alpha='auto'
                        ,eta='auto'
                        ,iterations=100
                        ,random_state=23
                        ,passes=20)
    print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
    # Dense document-to-document similarity index in topic space.
    index = similarities.MatrixSimilarity(ldamodel[doc_term_matrix])
    return ldamodel,dictionary,index,doc_term_matrix
def lda(tfidf_matrix, terms, topics = 3, num_words = 10):
    """Fit sklearn LDA on a TF-IDF matrix and print the top words per topic.

    tfidf_matrix: document-term matrix.
    terms: feature names aligned with the matrix columns.
    topics: number of topics to fit.
    num_words: number of top words to show per topic.

    BUG FIX: the topics and num_words parameters were previously
    overwritten inside the function with the hard-coded values 3 and 10.
    LatentDirichletAllocation was also never imported; the import block
    now pulls it in from sklearn.decomposition.
    """
    model = LatentDirichletAllocation(n_components=topics).fit(tfidf_matrix)
    topic_dict = {}
    for topic_num, topic in enumerate(model.components_):
        # argsort ascending; the reversed tail slice yields the num_words
        # highest-weight term indices.
        topic_dict[topic_num] = " ".join(
            [terms[i] for i in topic.argsort()[:-num_words - 1:-1]])
    print(topic_dict)
def word2vec(processed_text, size = 100):
    """Train Word2Vec on tokenized documents and return per-word vectors.

    processed_text: list of token lists, one per document.
    size: embedding dimensionality.

    Returns a DataFrame with one row per unique word and `size` columns.
    Side effect: prints the DataFrame.

    BUG FIX: the size parameter was previously ignored — vector_size was
    hard-coded to 100.
    """
    model_w2v = Word2Vec(processed_text, vector_size=size, window=5,
                         min_count=1, workers=4)
    # Unique vocabulary across all documents.  min_count=1 guarantees every
    # token seen in training has a learned vector.
    unique_words = {token for tokens in processed_text for token in tokens}
    w2v_vectors = {word: model_w2v.wv[word] for word in unique_words}
    # One row per word, one column per embedding dimension.
    w2v_df = pd.DataFrame(w2v_vectors).transpose()
    print(w2v_df)
    return w2v_df
# # Experimental#1 code here will show you the movie titles for the topics
# # Uncomment all the code in this cell and it will override the above functions
# # Also switch ot the modified Experiment#1 code lines elsewhere to add the "titles" parameter
# # or you can ignore the commented lines for Experiment#1 code and continue your assignment.
# def create_gensim_lda_model(doc_clean,number_of_topics,words, titles):
# dictionary = corpora.Dictionary(doc_clean)
# doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# ldamodel = LdaModel(doc_term_matrix ,num_topics=number_of_topics,id2word = dictionary,alpha='auto',
# eta='auto',iterations=100,random_state=23,passes=20)
# print(ldamodel.print_topics(num_topics=number_of_topics, num_words=words))
# index = similarities.MatrixSimilarity(ldamodel[doc_term_matrix])
# # THIS CODE BELOW IS EXPERIMENTAL#1 AND BARRY WANTS YOUR FEEDBACK IF THIS WORKS OR NOT
# # Method 1: Shows top 5 titles related to a topic; adjust the number accordingly.
# for topic in ldamodel.print_topics(num_topics=number_of_topics, num_words=words):
# topic_number = topic[0]
# sims = index[ldamodel[doc_term_matrix[topic_number]]]
# most_similar_titles = [titles[i[0]] for i in sorted(enumerate(sims), key=lambda item: -item[1])[:5]]
# print('Most similar titles for topic #{}: {}'.format(topic_number, most_similar_titles))
# # Method 2: Shows all titles sorted on most similar for each topic.
# # Note you get all titles for all topics!
# for document in doc_clean:
# doc_bow = dictionary.doc2bow(document)
# sims = index[ldamodel[doc_bow]]
# most_similar_topic = sorted(enumerate(sims), key=lambda item: -item[1])[0][0]
# #VERBOSE DEBUGGING: print('Document belongs to topic #{}'.format(most_similar_topic))
# topic_titles = {}
# for topic_number, topic in enumerate(ldamodel.print_topics(num_topics=number_of_topics, num_words=words)):
# sims = index[ldamodel[doc_term_matrix[topic_number]]]
# titles_for_topic = [titles[i[0]] for i in sorted(enumerate(sims), key=lambda item: -item[1])]
# topic_titles[topic_number] = titles_for_topic
# print('Titles for topic #{}: {}'.format(topic_number, titles_for_topic))
# return ldamodel,dictionary,index,doc_term_matrix
# ##########
# def create_gensim_lsa_model(doc_clean, number_of_topics, words, titles):
# dictionary = corpora.Dictionary(doc_clean)
# doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# # generate LSA model
# lsamodel = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary, power_iters=100)
# print(lsamodel.print_topics(num_topics=number_of_topics, num_words=words))
# index = similarities.MatrixSimilarity(lsamodel[doc_term_matrix])
# # THIS CODE BELOW IS EXPERIMENTAL AND BARRY WANTS YOUR FEEDBACK IF THIS WORKS OR NOT
# # Method 1: Shows top 5 titles related to a topic; adjust the number accordingly.
# for topic in lsamodel.print_topics(num_topics=number_of_topics, num_words=words):
# topic_number = topic[0]
# sims = index[lsamodel[doc_term_matrix[topic_number]]]
# most_similar_titles = [titles[i[0]] for i in sorted(enumerate(sims), key=lambda item: -item[1])[:5]]
# print('Most similar titles for topic #{}: {}'.format(topic_number, most_similar_titles))
# # Method 2: Shows all titles sorted on most similar for each topic.
# # Note you get all titles for all topics!
# for document in doc_clean:
# doc_bow = dictionary.doc2bow(document)
# sims = index[lsamodel[doc_bow]]
# most_similar_topic = sorted(enumerate(sims), key=lambda item: -item[1])[0][0]
# #VERBOSE DEBUGGING: print('Document belongs to topic #{}'.format(most_similar_topic))
# topic_titles = {}
# for topic_number, topic in enumerate(lsamodel.print_topics(num_topics=number_of_topics, num_words=words)):
# sims = index[lsamodel[doc_term_matrix[topic_number]]]
# titles_for_topic = [titles[i[0]] for i in sorted(enumerate(sims), key=lambda item: -item[1])]
# topic_titles[topic_number] = titles_for_topic
# print('Titles for topic #{}: {}'.format(topic_number, titles_for_topic))
# return lsamodel, dictionary, index
# ###########
# #
# # When you call create_gensim_lda_model(...) later on, you have to add titles to the call. Eg:
# #
# # model2,dictionary2,index2,doctermmatrix2=create_gensim_lda_model(processed_text,number_of_topics,words,titles)
# #
# # Similarly for LSA...
def k_means(titles, tfidf_matrix, k=3, texts=None):
    """Cluster documents with k-means over a TF-IDF (or doc2vec) matrix.

    titles: document names, aligned with the matrix rows.
    tfidf_matrix: feature matrix to cluster.
    k: number of clusters.
    texts: per-document text for the result frame.  Defaults to the
        module-level global `final_processed_text`, which was the original
        implicit dependency — the new parameter makes it injectable while
        staying backward compatible.

    Returns (cluster_title dict mapping cluster id -> titles,
             per-document cluster labels, result DataFrame).
    """
    if texts is None:
        texts = final_processed_text  # module-level global built from the corpus
    km = KMeans(n_clusters=k, random_state=89)
    km.fit(tfidf_matrix)
    clusters = km.labels_.tolist()
    frame = pd.DataFrame({'Doc Name': titles, 'Cluster': clusters, 'Text': texts},
                         columns=['Cluster', 'Doc Name', 'Text'])
    # Group titles by cluster id.  Note: doc2vec-based clusters have no
    # per-word output because each vector represents a whole document,
    # not individual words.
    cluster_title = {}
    for i in range(k):
        cluster_title[i] = frame.loc[frame['Cluster'] == i, 'Doc Name'].tolist()
    # Debugging aids:
    #print ("cluster_title", cluster_title)
    #print ("clusters", clusters)
    #print ("frame", frame)
    return cluster_title, clusters, frame
def classifiers(x, y, model_type, cv = 3):
    """Train one classifier on a 90/10 train/test split and return accuracy.

    x, y: feature matrix and labels.
    model_type: one of 'svm', 'logistic', 'naive_bayes', 'randomforest'.
    cv: accepted for interface compatibility but currently unused — no
        cross-validation is performed, only a single hold-out split.

    Raises ValueError for an unknown model_type.
    BUG FIX: previously an unrecognized model_type fell through every
    elif branch and left `model` unbound, raising NameError at fit time.
    """
    factories = {
        'svm': SVC,
        'logistic': LogisticRegression,
        'naive_bayes': MultinomialNB,
        'randomforest': RandomForestClassifier,
    }
    if model_type not in factories:
        raise ValueError("unknown model_type: %r" % (model_type,))
    print(model_type)  # preserves the original per-model trace output
    model = factories[model_type]()
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.10, random_state=23)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accy = accuracy_score(y_test, predictions)
    return accy
def plot_lsa(number_of_topics, words):
    """Fit an LSA model via create_gensim_lsa_model and plot the full
    document-to-document similarity matrix as a heatmap.

    Relies on the module-level globals `processed_text` and `titles`
    built from the corpus earlier in the notebook.
    Returns the fitted LSA model.
    """
    # BARRYC experimental#1
    #model,dictionary,index=create_gensim_lsa_model(processed_text,number_of_topics,words,titles)
    model,dictionary,index=create_gensim_lsa_model(processed_text,number_of_topics,words)
    # NOTE(review): this loop recomputes per-document similarity queries but
    # neither vec_lsi nor sims is used afterwards — looks like dead code;
    # confirm before removing.
    for doc in processed_text:
        vec_bow = dictionary.doc2bow(doc)
        vec_lsi = model[vec_bow] # convert the query to LSI space
        sims = index[vec_lsi] # perform a similarity query against the corpus
    fig, ax = plt.subplots(figsize=(30, 20))
    # The similarity index is rendered directly as a 2-D heatmap; axis ticks
    # label both dimensions with the document titles.
    cax = ax.matshow(index, interpolation='nearest')
    ax.grid(True)
    plt.xticks(range(len(processed_text)), titles, rotation=90, fontsize=8);
    plt.yticks(range(len(processed_text)), titles, fontsize=8);
    fig.colorbar(cax)
    plt.show()
    return model
def plot_tfidf_matrix(cluster_title,clusters,TFIDF_matrix):
    """Project documents to 2-D with MDS over cosine distance and scatter-plot
    them, one colored series per k-means cluster.

    cluster_title: dict mapping cluster id -> list of document titles
        (used as the legend label for each cluster).
    clusters: per-document cluster labels, aligned with TFIDF_matrix rows.
    TFIDF_matrix: document-term matrix.
    """
    # convert two components as we're plotting points in a two-dimensional plane
    # "precomputed" because we provide a distance matrix
    # we will also specify `random_state` so the plot is reproducible.
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
    dist = 1 - cosine_similarity(TFIDF_matrix)
    pos = mds.fit_transform(dist)  # shape (n_samples, n_components)
    xs, ys = pos[:, 0], pos[:, 1]
    # NOTE(review): this color dict is unused — colors are randomized per
    # cluster in the loop below.  Keep one scheme or the other deliberately.
    cluster_colors = {0: 'black', 1: 'grey', 2: 'blue', 3: 'rosybrown', 4: 'firebrick',
                      5:'red', 6:'darksalmon', 7:'sienna'}
    #set up cluster names using a dict.
    cluster_dict=cluster_title
    #create data frame that has the result of the MDS plus the cluster numbers and titles
    df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=range(0,len(clusters))))
    #group by cluster
    groups = df.groupby('label')
    fig, ax = plt.subplots(figsize=(20,20)) # set size
    ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
    # Layer one scatter series per cluster; the legend label comes from
    # cluster_dict[name] and each cluster gets a random RGB color.
    for name, group in groups:
        r = random.random()
        b = random.random()
        g = random.random()
        color = (r, g, b)
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
                label=cluster_dict[name], color=color,
                mec='none')
        ax.set_aspect('auto')
        # NOTE(review): 'off'/'on' strings for tick_params are deprecated in
        # recent matplotlib — should be booleans; confirm target version.
        ax.tick_params(\
            axis= 'x',          # changes apply to the x-axis
            which='both',       # both major and minor ticks are affected
            bottom='off',       # ticks along the bottom edge are off
            top='off',          # ticks along the top edge are off
            labelbottom='on')
        ax.tick_params(\
            axis= 'y',          # changes apply to the y-axis
            which='both',       # both major and minor ticks are affected
            left='off',         # ticks along the left edge are off
            top='off',          # ticks along the top edge are off
            labelleft='on')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 30}) #show legend with only 1 point
# Load the shared class corpus from GitHub and derive the module-level
# variables used throughout the notebook.
CORPUS_PATH=\
'https://raw.githubusercontent.com/barrycforever/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_ClassCorpus_Final_Sec57_v2_20230928.csv'
data = get_corpus_df(CORPUS_PATH)
#adding two columns to the dataframe to store the processed text and tokenized text
data['processed_text'] = data['Text'].apply(lambda x: clean_doc(x))
#creating final processed text variables for matrix creation
final_processed_text = [' '.join(x) for x in data['processed_text'].tolist()]
# Module-level globals consumed later by k_means() and plot_lsa().
titles = data['DSI_Title'].tolist()
processed_text = data['processed_text'].tolist()
# Re-load the same corpus URL for the Document-based exploration below.
CORPUS_PATH=\
'https://raw.githubusercontent.com/barrycforever/MSDS_453_Public/main/MSDS453_ClassCorpus/MSDS453_ClassCorpus_Final_Sec57_v2_20230928.csv'
corpus_df = get_corpus_df(CORPUS_PATH)
# Wrap each corpus row as a Document(doc_id, text) record.
documents = [Document(x, y) for x, y in zip(corpus_df.Doc_ID, corpus_df.Text)]
# Notebook cell expression — recorded output below: (200, 9).
corpus_df.shape
(200, 9)
corpus_df.head().T
| Doc_ID | 90 | 91 | 92 | 93 | 94 |
|---|---|---|---|---|---|
| DSI_Title | LKP_Doc1_Angel_has_fallen | LKP_Doc2_Angel_has_fallen | LKP_Doc3_Angel_has_fallen | LKP_Doc4_Angel_has_fallen | LKP_Doc5_Angel_has_fallen |
| Text | Target is on the move . '' Heavily outnumbered... | The sleepy , dopey action bonanza `` Angel Has... | `` Angel Has Fallen '' marks the third time th... | Improving through blandness . So Mike Banning ... | This review may contain spoilers . Angel Has F... |
| Submission File Name | LKP_Doc1_Angel_has_fallen | LKP_Doc2_Angel_has_fallen | LKP_Doc3_Angel_has_fallen | LKP_Doc4_Angel_has_fallen | LKP_Doc5_Angel_has_fallen |
| Student Name | LKP | LKP | LKP | LKP | LKP |
| Genre of Movie | Action | Action | Action | Action | Action |
| Review Type (pos or neg) | Negative | Negative | Negative | Negative | Negative |
| Movie Title | Angel_has_fallen | Angel_has_fallen | Angel_has_fallen | Angel_has_fallen | Angel_has_fallen |
| Descriptor | Action_Angel_has_fallen_N_90 | Action_Angel_has_fallen_N_91 | Action_Angel_has_fallen_N_92 | Action_Angel_has_fallen_N_93 | Action_Angel_has_fallen_N_94 |
| Doc_ID | 90 | 91 | 92 | 93 | 94 |
print(corpus_df.info());
<class 'pandas.core.frame.DataFrame'> Int64Index: 200 entries, 90 to 164 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 DSI_Title 200 non-null object 1 Text 200 non-null object 2 Submission File Name 200 non-null object 3 Student Name 200 non-null object 4 Genre of Movie 200 non-null object 5 Review Type (pos or neg) 200 non-null object 6 Movie Title 200 non-null object 7 Descriptor 200 non-null object 8 Doc_ID 200 non-null int64 dtypes: int64(1), object(8) memory usage: 15.6+ KB None
print(corpus_df['Movie Title'].unique())
['Angel_has_fallen' 'Inception' 'No_Time_to_Die' 'Taken' 'Taxi' 'Barbie' 'Dirty_Grandpa' 'Holmes_and_Watson' 'Lost City' 'Mean_Girls' 'Drag_Me_To_Hell' 'Fresh' 'It_Chapter_Two' 'The Conjuring 2' 'The_Others' 'EQUILIBRIUM' 'Minority_Report' 'Oblivion' 'Pitch_Black' 'The_Batman']
Exploring different numbers of...
LSA: 2, 6, 20 concepts and 10 words
LDA: 2, 6, 20 topics and 10 words
model_2concepts_10words=plot_lsa(2, 10)
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"')]
model_6concepts_10words=plot_lsa(4, 10)
Output hidden; open in https://colab.research.google.com to view.
model_20concepts_10words=plot_lsa(20, 10)
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '0.245*"horror" + 0.218*"raimi" + 0.205*"christine" + -0.156*"cruise" + -0.153*"oblivion" + 0.146*"fresh" + -0.141*"house" + -0.136*"grace" + -0.134*"earth" + -0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '0.239*"loretta" + -0.198*"action" + -0.186*"really" + 0.168*"bullock" + 0.167*"oblivion" + 0.156*"cruise" + -0.147*"inception" + 0.144*"romance" + -0.141*"french" + 0.139*"earth"'), (10, '-0.399*"jason" + -0.316*"grandpa" + -0.286*"dirty" + -0.179*"efron" + -0.158*"grandson" + -0.131*"grandfather" + -0.119*"robert" + -0.114*"lawyer" + -0.108*"plaza" + 
-0.107*"kelly"'), (11, '0.185*"chapter" + 0.179*"characters" + -0.177*"fresh" + -0.169*"steve" + 0.169*"riddick" + -0.151*"women" + 0.148*"conjuring" + 0.132*"james" + -0.128*"dating" + 0.127*"character"'), (12, '-0.235*"riddick" + -0.225*"black" + 0.193*"chapter" + -0.188*"pitch" + 0.137*"james" + -0.132*"planet" + 0.121*"action" + 0.119*"pennywise" + 0.117*"inception" + 0.115*"nolan"'), (13, '-0.292*"raimi" + 0.286*"fresh" + -0.280*"christine" + 0.255*"steve" + 0.193*"dating" + 0.167*"women" + -0.136*"lohman" + -0.119*"gypsy" + 0.117*"edgarjones" + -0.116*"would"'), (14, '0.327*"conjuring" + -0.252*"chapter" + 0.196*"lorraine" + -0.148*"pennywise" + -0.140*"grace" + 0.129*"warrens" + 0.121*"janet" + 0.118*"farmiga" + 0.113*"hodgson" + -0.104*"muschietti"'), (15, '0.319*"preston" + 0.257*"emotions" + 0.189*"equilibrium" + -0.167*"inception" + -0.160*"nolan" + -0.130*"oblivion" + 0.129*"world" + 0.120*"called" + 0.119*"christian" + -0.115*"story"'), (16, '0.343*"craig" + -0.204*"anderton" + -0.170*"story" + -0.167*"spielberg" + 0.163*"james" + -0.158*"would" + 0.152*"daniel" + 0.135*"great" + -0.123*"report" + 0.123*"madeleine"'), (17, '0.315*"craig" + 0.202*"would" + 0.201*"anderton" + 0.177*"spielberg" + 0.143*"character" + 0.142*"report" + 0.140*"minority" + -0.134*"action" + -0.133*"great" + 0.132*"james"'), (18, '-0.364*"inception" + -0.316*"nolan" + 0.189*"action" + 0.175*"french" + 0.154*"daughter" + -0.145*"characters" + -0.129*"character" + -0.121*"emotion" + -0.120*"preston" + 0.108*"people"'), (19, '0.278*"daughter" + -0.205*"french" + -0.181*"great" + 0.175*"neeson" + -0.172*"story" + 0.168*"paris" + 0.136*"taken" + -0.125*"comedy" + -0.107*"anderton" + -0.101*"daniel"')]
# Sweep LSA topic counts 2..19 and score each fitted model with c_v coherence.
# (Keeps the module-level names topics / lsamodel / dictionary / index /
# coherence, since later cells may reference them.)
topics = list(range(2, 20))
coherence_values = []
for n_topics in topics:
    # BARRYC EXPERIMENTAL#1 variant would also pass `titles` here.
    lsamodel, dictionary, index = create_gensim_lsa_model(processed_text, n_topics, 10)
    cm = CoherenceModel(model=lsamodel, dictionary=dictionary,
                        texts=processed_text, coherence='c_v')
    coherence_values.append(cm.get_coherence())
# Label each score with its topic count; coherence_values[n-2] aligns the
# list (which starts at 2 topics) with the topic count n.
coherence = {f"{n} concepts 10 words": coherence_values[n - 2] for n in topics}
print(coherence)
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '-0.433*"batman" + -0.182*"reeves" + -0.154*"barbie" + -0.152*"fallen" + -0.147*"house" + -0.129*"president" + 0.121*"black" + 0.121*"riddick" + 0.120*"people" + -0.120*"gotham"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '-0.433*"batman" + -0.182*"reeves" + -0.154*"barbie" + -0.152*"fallen" + -0.147*"house" + -0.129*"president" + 0.121*"black" + 0.121*"riddick" + 0.120*"people" + -0.120*"gotham"'), (7, '0.245*"horror" + 0.218*"raimi" + 0.205*"christine" + -0.156*"cruise" + -0.153*"oblivion" + 0.146*"fresh" + -0.141*"house" + -0.136*"grace" + -0.134*"earth" + -0.128*"planet"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.174*"would" + 0.170*"story" + 0.163*"action" + 0.151*"first" + 0.141*"world" + 0.139*"characters" + 0.135*"character" + 0.129*"years" + 0.127*"could" + 0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '0.245*"horror" + 0.218*"raimi" + 0.205*"christine" + -0.156*"cruise" + -0.153*"oblivion" + 0.146*"fresh" + -0.141*"house" + -0.136*"grace" + -0.134*"earth" + -0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '-0.433*"batman" + -0.182*"reeves" + -0.154*"barbie" + -0.152*"fallen" + -0.147*"house" + -0.129*"president" + 0.121*"black" + 0.121*"riddick" + 0.120*"people" + -0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '0.239*"loretta" + -0.198*"action" + -0.186*"really" + 0.168*"bullock" + 0.167*"oblivion" + 0.156*"cruise" + -0.147*"inception" + 0.144*"romance" + -0.141*"french" + 0.139*"earth"'), (10, '-0.399*"jason" + -0.316*"grandpa" + -0.286*"dirty" + -0.179*"efron" + -0.158*"grandson" + -0.131*"grandfather" + -0.119*"robert" + -0.114*"lawyer" + -0.108*"plaza" + 
-0.107*"kelly"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '-0.256*"loretta" + 0.218*"horror" + -0.178*"bullock" + -0.174*"romance" + 0.164*"raimi" + 0.147*"christine" + -0.130*"house" + -0.125*"grace" + -0.125*"action" + -0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '0.399*"jason" + 0.316*"grandpa" + 0.286*"dirty" + 0.179*"efron" + 0.158*"grandson" + 0.131*"grandfather" + 0.119*"robert" + 0.114*"lawyer" + 0.108*"plaza" + 
0.107*"kelly"'), (11, '0.185*"chapter" + 0.179*"characters" + -0.177*"fresh" + -0.169*"steve" + 0.169*"riddick" + -0.151*"women" + 0.148*"conjuring" + 0.132*"james" + -0.128*"dating" + 0.127*"character"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '-0.256*"loretta" + 0.218*"horror" + -0.178*"bullock" + -0.174*"romance" + 0.164*"raimi" + 0.147*"christine" + -0.130*"house" + -0.125*"grace" + -0.125*"action" + -0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '0.399*"jason" + 0.316*"grandpa" + 0.286*"dirty" + 0.179*"efron" + 0.158*"grandson" + 0.131*"grandfather" + 0.119*"robert" + 0.114*"lawyer" + 0.108*"plaza" + 
0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '-0.235*"riddick" + -0.225*"black" + 0.193*"chapter" + -0.188*"pitch" + 0.137*"james" + -0.132*"planet" + 0.121*"action" + 0.119*"pennywise" + 0.117*"inception" + 0.115*"nolan"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '-0.433*"batman" + -0.182*"reeves" + -0.154*"barbie" + -0.152*"fallen" + -0.147*"house" + -0.129*"president" + 0.121*"black" + 0.121*"riddick" + 0.120*"people" + -0.120*"gotham"'), (7, '0.245*"horror" + 0.218*"raimi" + 0.205*"christine" + -0.156*"cruise" + -0.153*"oblivion" + 0.146*"fresh" + -0.141*"house" + -0.136*"grace" + -0.134*"earth" + -0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '-0.399*"jason" + -0.316*"grandpa" + -0.286*"dirty" + -0.179*"efron" + -0.158*"grandson" + -0.131*"grandfather" + -0.119*"robert" + -0.114*"lawyer" + 
-0.108*"plaza" + -0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '0.235*"riddick" + 0.225*"black" + -0.193*"chapter" + 0.188*"pitch" + -0.137*"james" + 0.132*"planet" + -0.121*"action" + -0.119*"pennywise" + -0.117*"inception" + -0.115*"nolan"'), (13, '-0.292*"raimi" + 0.286*"fresh" + -0.280*"christine" + 0.255*"steve" + 0.193*"dating" + 0.167*"women" + -0.136*"lohman" + -0.119*"gypsy" + 0.117*"edgarjones" + -0.116*"would"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '0.399*"jason" + 0.316*"grandpa" + 0.286*"dirty" + 0.179*"efron" + 0.158*"grandson" + 0.131*"grandfather" + 0.119*"robert" + 0.114*"lawyer" + 0.108*"plaza" + 
0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '0.235*"riddick" + 0.225*"black" + -0.193*"chapter" + 0.188*"pitch" + -0.137*"james" + 0.132*"planet" + -0.121*"action" + -0.119*"pennywise" + -0.117*"inception" + -0.115*"nolan"'), (13, '-0.292*"raimi" + 0.286*"fresh" + -0.280*"christine" + 0.255*"steve" + 0.193*"dating" + 0.167*"women" + -0.136*"lohman" + -0.119*"gypsy" + 0.117*"edgarjones" + -0.116*"would"'), (14, '-0.327*"conjuring" + 0.252*"chapter" + -0.196*"lorraine" + 0.148*"pennywise" + 0.140*"grace" + -0.129*"warrens" + -0.121*"janet" + -0.118*"farmiga" + -0.113*"hodgson" + 0.104*"muschietti"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '-0.433*"batman" + -0.182*"reeves" + -0.154*"barbie" + -0.152*"fallen" + -0.147*"house" + -0.129*"president" + 0.121*"black" + 0.121*"riddick" + 0.120*"people" + -0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '-0.256*"loretta" + 0.218*"horror" + -0.178*"bullock" + -0.174*"romance" + 0.164*"raimi" + 0.147*"christine" + -0.130*"house" + -0.125*"grace" + -0.125*"action" + -0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '-0.399*"jason" + -0.316*"grandpa" + -0.286*"dirty" + -0.179*"efron" + -0.158*"grandson" + -0.131*"grandfather" + -0.119*"robert" + -0.114*"lawyer" + 
-0.108*"plaza" + -0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '-0.235*"riddick" + -0.225*"black" + 0.193*"chapter" + -0.188*"pitch" + 0.137*"james" + -0.132*"planet" + 0.121*"action" + 0.119*"pennywise" + 0.117*"inception" + 0.115*"nolan"'), (13, '-0.292*"raimi" + 0.286*"fresh" + -0.280*"christine" + 0.255*"steve" + 0.193*"dating" + 0.167*"women" + -0.136*"lohman" + -0.119*"gypsy" + 0.117*"edgarjones" + -0.116*"would"'), (14, '0.327*"conjuring" + -0.252*"chapter" + 0.196*"lorraine" + -0.148*"pennywise" + -0.140*"grace" + 0.129*"warrens" + 0.121*"janet" + 0.118*"farmiga" + 0.113*"hodgson" + -0.104*"muschietti"'), (15, '0.319*"preston" + 0.257*"emotions" + 0.189*"equilibrium" + -0.167*"inception" + -0.160*"nolan" + -0.130*"oblivion" + 0.129*"world" + 0.120*"called" + 0.119*"christian" + -0.115*"story"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '0.450*"school" + 0.365*"girls" + -0.242*"barbie" + 0.177*"janis" + 0.167*"regina" + 0.158*"lohan" + -0.146*"batman" + 0.143*"plastics" + 0.128*"queen" + 0.117*"cliques"'), (5, '0.439*"batman" + -0.233*"fallen" + -0.213*"house" + -0.199*"president" + -0.186*"banning" + 0.182*"reeves" + -0.156*"angel" + -0.131*"grace" + 0.119*"gotham" + -0.117*"barbie"'), (6, '-0.433*"batman" + -0.182*"reeves" + -0.154*"barbie" + -0.152*"fallen" + -0.147*"house" + -0.129*"president" + 0.121*"black" + 0.121*"riddick" + 0.120*"people" + -0.120*"gotham"'), (7, '0.245*"horror" + 0.218*"raimi" + 0.205*"christine" + -0.156*"cruise" + -0.153*"oblivion" + 0.146*"fresh" + -0.141*"house" + -0.136*"grace" + -0.134*"earth" + -0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '-0.399*"jason" + -0.316*"grandpa" + -0.286*"dirty" + -0.179*"efron" + -0.158*"grandson" + -0.131*"grandfather" + -0.119*"robert" + -0.114*"lawyer" + 
-0.108*"plaza" + -0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '-0.235*"riddick" + -0.225*"black" + 0.193*"chapter" + -0.188*"pitch" + 0.137*"james" + -0.132*"planet" + 0.121*"action" + 0.119*"pennywise" + 0.117*"inception" + 0.115*"nolan"'), (13, '0.292*"raimi" + -0.286*"fresh" + 0.280*"christine" + -0.255*"steve" + -0.193*"dating" + -0.167*"women" + 0.136*"lohman" + 0.119*"gypsy" + -0.117*"edgarjones" + 0.116*"would"'), (14, '-0.327*"conjuring" + 0.252*"chapter" + -0.196*"lorraine" + 0.148*"pennywise" + 0.140*"grace" + -0.129*"warrens" + -0.121*"janet" + -0.118*"farmiga" + -0.113*"hodgson" + 0.104*"muschietti"'), (15, '0.319*"preston" + 0.257*"emotions" + 0.189*"equilibrium" + -0.167*"inception" + -0.160*"nolan" + -0.130*"oblivion" + 0.129*"world" + 0.120*"called" + 0.119*"christian" + -0.115*"story"'), (16, '0.343*"craig" + -0.204*"anderton" + -0.170*"story" + -0.167*"spielberg" + 0.163*"james" + -0.158*"would" + 0.152*"daniel" + 0.135*"great" + -0.123*"report" + 0.123*"madeleine"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '-0.567*"holmes" + -0.384*"watson" + -0.273*"ferrell" + -0.240*"reilly" + -0.228*"sherlock" + -0.162*"comedy" + -0.124*"brothers" + -0.090*"jokes" + 0.088*"action" + -0.086*"moriarty"'), (2, '0.713*"barbie" + 0.167*"world" + 0.130*"girls" + 0.120*"school" + -0.119*"batman" + 0.115*"robbie" + 0.114*"women" + -0.092*"house" + 0.092*"daughter" + -0.091*"fallen"'), (3, '0.321*"fallen" + 0.255*"president" + -0.253*"house" + 0.235*"banning" + 0.226*"action" + 0.206*"angel" + -0.179*"horror" + -0.172*"grace" + -0.158*"children" + 0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '-0.256*"loretta" + 0.218*"horror" + -0.178*"bullock" + -0.174*"romance" + 0.164*"raimi" + 0.147*"christine" + -0.130*"house" + -0.125*"grace" + -0.125*"action" + -0.115*"daniel"'), (9, '0.239*"loretta" + -0.198*"action" + -0.186*"really" + 0.168*"bullock" + 0.167*"oblivion" + 0.156*"cruise" + -0.147*"inception" + 0.144*"romance" + -0.141*"french" + 0.139*"earth"'), (10, '-0.399*"jason" + -0.316*"grandpa" + -0.286*"dirty" + -0.179*"efron" + -0.158*"grandson" + -0.131*"grandfather" + -0.119*"robert" + -0.114*"lawyer" + 
-0.108*"plaza" + -0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '0.235*"riddick" + 0.225*"black" + -0.193*"chapter" + 0.188*"pitch" + -0.137*"james" + 0.132*"planet" + -0.121*"action" + -0.119*"pennywise" + -0.117*"inception" + -0.115*"nolan"'), (13, '0.292*"raimi" + -0.286*"fresh" + 0.280*"christine" + -0.255*"steve" + -0.193*"dating" + -0.167*"women" + 0.136*"lohman" + 0.119*"gypsy" + -0.117*"edgarjones" + 0.116*"would"'), (14, '-0.327*"conjuring" + 0.252*"chapter" + -0.196*"lorraine" + 0.148*"pennywise" + 0.140*"grace" + -0.129*"warrens" + -0.121*"janet" + -0.118*"farmiga" + -0.113*"hodgson" + 0.104*"muschietti"'), (15, '0.319*"preston" + 0.257*"emotions" + 0.189*"equilibrium" + -0.167*"inception" + -0.160*"nolan" + -0.130*"oblivion" + 0.129*"world" + 0.120*"called" + 0.119*"christian" + -0.115*"story"'), (16, '-0.343*"craig" + 0.204*"anderton" + 0.170*"story" + 0.167*"spielberg" + -0.163*"james" + 0.158*"would" + -0.152*"daniel" + -0.135*"great" + 0.123*"report" + -0.123*"madeleine"'), (17, '-0.315*"craig" + -0.202*"would" + -0.201*"anderton" + -0.177*"spielberg" + -0.143*"character" + -0.142*"report" + -0.140*"minority" + 0.134*"action" + 0.133*"great" + -0.132*"james"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '-0.174*"would" + -0.170*"story" + -0.163*"action" + -0.151*"first" + -0.141*"world" + -0.139*"characters" + -0.135*"character" + -0.129*"years" + -0.127*"could" + -0.125*"still"'), (1, '0.567*"holmes" + 0.384*"watson" + 0.273*"ferrell" + 0.240*"reilly" + 0.228*"sherlock" + 0.162*"comedy" + 0.124*"brothers" + 0.090*"jokes" + -0.088*"action" + 0.086*"moriarty"'), (2, '-0.713*"barbie" + -0.167*"world" + -0.130*"girls" + -0.120*"school" + 0.119*"batman" + -0.115*"robbie" + -0.114*"women" + 0.092*"house" + -0.092*"daughter" + 0.091*"fallen"'), (3, '-0.321*"fallen" + -0.255*"president" + 0.253*"house" + -0.235*"banning" + -0.226*"action" + -0.206*"angel" + 0.179*"horror" + 0.172*"grace" + 0.158*"children" + -0.126*"butler"'), (4, '-0.450*"school" + -0.365*"girls" + 0.242*"barbie" + -0.177*"janis" + -0.167*"regina" + -0.158*"lohan" + 0.146*"batman" + -0.143*"plastics" + -0.128*"queen" + -0.117*"cliques"'), (5, '-0.439*"batman" + 0.233*"fallen" + 0.213*"house" + 0.199*"president" + 0.186*"banning" + -0.182*"reeves" + 0.156*"angel" + 0.131*"grace" + -0.119*"gotham" + 0.117*"barbie"'), (6, '0.433*"batman" + 0.182*"reeves" + 0.154*"barbie" + 0.152*"fallen" + 0.147*"house" + 0.129*"president" + -0.121*"black" + -0.121*"riddick" + -0.120*"people" + 0.120*"gotham"'), (7, '-0.245*"horror" + -0.218*"raimi" + -0.205*"christine" + 0.156*"cruise" + 0.153*"oblivion" + -0.146*"fresh" + 0.141*"house" + 0.136*"grace" + 0.134*"earth" + 0.128*"planet"'), (8, '0.256*"loretta" + -0.218*"horror" + 0.178*"bullock" + 0.174*"romance" + -0.164*"raimi" + -0.147*"christine" + 0.130*"house" + 0.125*"grace" + 0.125*"action" + 0.115*"daniel"'), (9, '-0.239*"loretta" + 0.198*"action" + 0.186*"really" + -0.168*"bullock" + -0.167*"oblivion" + -0.156*"cruise" + 0.147*"inception" + -0.144*"romance" + 0.141*"french" + -0.139*"earth"'), (10, '0.399*"jason" + 0.316*"grandpa" + 0.286*"dirty" + 0.179*"efron" + 0.158*"grandson" + 0.131*"grandfather" + 0.119*"robert" + 0.114*"lawyer" + 0.108*"plaza" + 
0.107*"kelly"'), (11, '-0.185*"chapter" + -0.179*"characters" + 0.177*"fresh" + 0.169*"steve" + -0.169*"riddick" + 0.151*"women" + -0.148*"conjuring" + -0.132*"james" + 0.128*"dating" + -0.127*"character"'), (12, '0.235*"riddick" + 0.225*"black" + -0.193*"chapter" + 0.188*"pitch" + -0.137*"james" + 0.132*"planet" + -0.121*"action" + -0.119*"pennywise" + -0.117*"inception" + -0.115*"nolan"'), (13, '-0.292*"raimi" + 0.286*"fresh" + -0.280*"christine" + 0.255*"steve" + 0.193*"dating" + 0.167*"women" + -0.136*"lohman" + -0.119*"gypsy" + 0.117*"edgarjones" + -0.116*"would"'), (14, '0.327*"conjuring" + -0.252*"chapter" + 0.196*"lorraine" + -0.148*"pennywise" + -0.140*"grace" + 0.129*"warrens" + 0.121*"janet" + 0.118*"farmiga" + 0.113*"hodgson" + -0.104*"muschietti"'), (15, '0.319*"preston" + 0.257*"emotions" + 0.189*"equilibrium" + -0.167*"inception" + -0.160*"nolan" + -0.130*"oblivion" + 0.129*"world" + 0.120*"called" + 0.119*"christian" + -0.115*"story"'), (16, '0.343*"craig" + -0.204*"anderton" + -0.170*"story" + -0.167*"spielberg" + 0.163*"james" + -0.158*"would" + 0.152*"daniel" + 0.135*"great" + -0.123*"report" + 0.123*"madeleine"'), (17, '0.315*"craig" + 0.202*"would" + 0.201*"anderton" + 0.177*"spielberg" + 0.143*"character" + 0.142*"report" + 0.140*"minority" + -0.134*"action" + -0.133*"great" + 0.132*"james"'), (18, '0.364*"inception" + 0.316*"nolan" + -0.189*"action" + -0.175*"french" + -0.154*"daughter" + 0.145*"characters" + 0.129*"character" + 0.121*"emotion" + 0.120*"preston" + -0.108*"people"')]
{'2 concepts 10 words': 0.4836655647591568, '3 concepts 10 words': 0.4266172342686607, '4 concepts 10 words': 0.37635176667234266, '5 concepts 10 words': 0.4542133873456625, '6 concepts 10 words': 0.3827877167641211, '7 concepts 10 words': 0.5048815185636452, '8 concepts 10 words': 0.42953528081674386, '9 concepts 10 words': 0.3664118979022236, '10 concepts 10 words': 0.48864933251100773, '11 concepts 10 words': 0.5457376933177224, '12 concepts 10 words': 0.4813480864580224, '13 concepts 10 words': 0.4937346307261734, '14 concepts 10 words': 0.47651985146462084, '15 concepts 10 words': 0.5239296744797176, '16 concepts 10 words': 0.5074461015344318, '17 concepts 10 words': 0.48915343397667455, '18 concepts 10 words': 0.45506020909849787, '19 concepts 10 words': 0.5009660435816203}
# Re-plot LSA with 7 concepts / 10 words — 7 concepts scored well (0.505) in the sweep printed above.
model_7concepts_10words=plot_lsa(7, 10)
Output hidden; open in https://colab.research.google.com to view.
######################################
#EXPERIMENT WITH THESE PARAMETERS
number_of_topics=4
words=20
#####################################
# BARRYC EXPERIMENTAL#1
#model2,dictionary2,index2,doctermmatrix2=create_gensim_lda_model(processed_text,number_of_topics,words,titles)
# Build the LDA model over the preprocessed corpus; returns the model, its
# dictionary, a similarity index over the corpus, and the doc-term matrix.
model2,dictionary2,index2,doctermmatrix2=create_gensim_lda_model(processed_text,number_of_topics, words)
# Query every document against the corpus similarity index.  The results are
# only inspected via the commented-out print; after the loop, sims2/vec2 hold
# the values for the LAST document only.
for doc in processed_text:
    vec_bow2 = dictionary2.doc2bow(doc)  # bag-of-words vector for this document
    vec2 = model2[vec_bow2] # convert the query to embedded space
    sims2 = index2[vec2] # perform a similarity query against the corpus
    #print(list(enumerate(sims2)))
# Heat-map of the full document-to-document similarity matrix, axes labelled
# with movie titles.  NOTE(review): matshow(index2) assumes the gensim index
# object is array-like (true for MatrixSimilarity) — confirm if index type changes.
fig, ax = plt.subplots(figsize=(30, 10))
cax = ax.matshow(index2, interpolation='nearest')
ax.grid(True)
plt.xticks(range(len(processed_text)), titles, rotation=90);
plt.yticks(range(len(processed_text)), titles);
fig.colorbar(cax)
plt.show()
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"house" + 0.003*"character" + 0.003*"characters" + 0.003*"could" + 0.003*"grace" + 0.003*"people" + 0.002*"story" + 0.002*"fallen" + 0.002*"banning" + 0.002*"children" + 0.002*"action" + 0.002*"first" + 0.002*"school" + 0.002*"director" + 0.002*"president" + 0.002*"really" + 0.002*"nolan" + 0.002*"around" + 0.002*"kidman" + 0.002*"horror"'), (1, '0.006*"barbie" + 0.004*"holmes" + 0.004*"would" + 0.003*"jason" + 0.003*"first" + 0.003*"comedy" + 0.003*"story" + 0.002*"watson" + 0.002*"character" + 0.002*"steve" + 0.002*"every" + 0.002*"world" + 0.002*"could" + 0.002*"still" + 0.002*"actually" + 0.002*"going" + 0.002*"characters" + 0.002*"someone" + 0.002*"never" + 0.002*"years"'), (2, '0.005*"batman" + 0.004*"story" + 0.004*"would" + 0.003*"action" + 0.003*"years" + 0.003*"first" + 0.002*"still" + 0.002*"played" + 0.002*"holmes" + 0.002*"reeves" + 0.002*"french" + 0.002*"world" + 0.002*"watson" + 0.002*"character" + 0.002*"loretta" + 0.002*"something" + 0.002*"characters" + 0.002*"great" + 0.002*"daniel" + 0.002*"better"'), (3, '0.004*"action" + 0.003*"world" + 0.003*"horror" + 0.003*"women" + 0.003*"first" + 0.003*"girls" + 0.003*"really" + 0.003*"would" + 0.003*"story" + 0.003*"christine" + 0.002*"years" + 0.002*"could" + 0.002*"raimi" + 0.002*"never" + 0.002*"still" + 0.002*"young" + 0.002*"school" + 0.002*"something" + 0.002*"characters" + 0.002*"people"')]
# Sweep LDA topic counts from 2 through 20 and score each model with the c_v
# topic-coherence metric (higher is better); the printed dict is used to pick
# number_of_topics for the experiment above.
topics = list(range(2, 21))  # idiomatic: no need for a comprehension to copy a range
coherence_values = []
for t in topics:
    #BARRYC EXPERIMENTAL#1
    # ldamodel,dictionary,index, matrix = create_gensim_lda_model(processed_text,t,10,titles)
    ldamodel,dictionary,index, matrix = create_gensim_lda_model(processed_text,t,10)
    coherence_model_lda = CoherenceModel(model=ldamodel, dictionary=dictionary, texts=processed_text, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    coherence_values.append(coherence_lda)
# Pair each topic count with its score; zip replaces the fragile
# coherence_values[i-2] index arithmetic while producing the same mapping.
coherence = {f"{t} topics 10 words": cv for t, cv in zip(topics, coherence_values)}
print(coherence)
WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.003*"action" + 0.003*"story" + 0.003*"horror" + 0.003*"house" + 0.003*"first" + 0.003*"characters" + 0.002*"character" + 0.002*"school" + 0.002*"years" + 0.002*"still"'), (1, '0.004*"would" + 0.003*"barbie" + 0.003*"holmes" + 0.003*"world" + 0.003*"first" + 0.002*"story" + 0.002*"watson" + 0.002*"comedy" + 0.002*"action" + 0.002*"years"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"house" + 0.003*"horror" + 0.003*"story" + 0.003*"characters" + 0.003*"could" + 0.003*"raimi" + 0.003*"character" + 0.003*"really" + 0.002*"first" + 0.002*"christine"'), (1, '0.005*"barbie" + 0.004*"would" + 0.003*"world" + 0.003*"holmes" + 0.003*"action" + 0.003*"first" + 0.002*"story" + 0.002*"years" + 0.002*"comedy" + 0.002*"steve"'), (2, '0.004*"batman" + 0.003*"action" + 0.003*"would" + 0.003*"story" + 0.003*"years" + 0.003*"first" + 0.003*"still" + 0.002*"people" + 0.002*"world" + 0.002*"character"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"house" + 0.003*"character" + 0.003*"characters" + 0.003*"could" + 0.003*"grace" + 0.003*"people" + 0.002*"story" + 0.002*"fallen" + 0.002*"banning" + 0.002*"children"'), (1, '0.006*"barbie" + 0.004*"holmes" + 0.004*"would" + 0.003*"jason" + 0.003*"first" + 0.003*"comedy" + 0.003*"story" + 0.002*"watson" + 0.002*"character" + 0.002*"steve"'), (2, '0.005*"batman" + 0.004*"story" + 0.004*"would" + 0.003*"action" + 0.003*"years" + 0.003*"first" + 0.002*"still" + 0.002*"played" + 0.002*"holmes" + 0.002*"reeves"'), (3, '0.004*"action" + 0.003*"world" + 0.003*"horror" + 0.003*"women" + 0.003*"first" + 0.003*"girls" + 0.003*"really" + 0.003*"would" + 0.003*"story" + 0.003*"christine"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"house" + 0.004*"character" + 0.003*"characters" + 0.003*"really" + 0.003*"banning" + 0.003*"fallen" + 0.003*"president" + 0.003*"people" + 0.003*"story" + 0.003*"grace"'), (1, '0.006*"barbie" + 0.005*"holmes" + 0.004*"would" + 0.003*"watson" + 0.003*"story" + 0.003*"first" + 0.003*"comedy" + 0.003*"jason" + 0.002*"could" + 0.002*"every"'), (2, '0.005*"batman" + 0.004*"would" + 0.004*"action" + 0.003*"story" + 0.003*"years" + 0.003*"still" + 0.002*"played" + 0.002*"first" + 0.002*"french" + 0.002*"great"'), (3, '0.005*"action" + 0.004*"world" + 0.003*"christine" + 0.003*"horror" + 0.003*"story" + 0.003*"years" + 0.003*"women" + 0.003*"first" + 0.003*"raimi" + 0.003*"still"'), (4, '0.006*"girls" + 0.005*"school" + 0.003*"batman" + 0.003*"janis" + 0.002*"fresh" + 0.002*"regina" + 0.002*"first" + 0.002*"young" + 0.002*"world" + 0.002*"plastics"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.005*"house" + 0.004*"character" + 0.003*"grace" + 0.003*"characters" + 0.003*"could" + 0.002*"always" + 0.002*"banning" + 0.002*"story" + 0.002*"director" + 0.002*"first"'), (1, '0.007*"barbie" + 0.007*"holmes" + 0.006*"would" + 0.004*"watson" + 0.003*"first" + 0.003*"comedy" + 0.003*"jason" + 0.003*"story" + 0.003*"loretta" + 0.003*"ferrell"'), (2, '0.006*"batman" + 0.004*"action" + 0.003*"played" + 0.003*"character" + 0.003*"would" + 0.003*"story" + 0.003*"years" + 0.003*"reeves" + 0.003*"great" + 0.002*"world"'), (3, '0.004*"action" + 0.004*"world" + 0.003*"christine" + 0.003*"years" + 0.003*"horror" + 0.003*"story" + 0.003*"women" + 0.003*"first" + 0.003*"could" + 0.003*"never"'), (4, '0.007*"girls" + 0.006*"school" + 0.005*"fresh" + 0.004*"steve" + 0.003*"batman" + 0.003*"dating" + 0.003*"janis" + 0.003*"first" + 0.003*"regina" + 0.003*"would"'), (5, '0.004*"action" + 0.003*"conjuring" + 0.003*"characters" + 0.003*"really" + 0.003*"story" + 0.003*"fallen" + 0.003*"house" + 0.003*"french" + 0.003*"still" + 0.003*"something"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"school" + 0.004*"character" + 0.003*"inception" + 0.003*"banning" + 0.003*"first" + 0.003*"president" + 0.002*"could" + 0.002*"never" + 0.002*"characters" + 0.002*"played"'), (1, '0.007*"holmes" + 0.006*"barbie" + 0.005*"watson" + 0.004*"would" + 0.003*"jason" + 0.003*"comedy" + 0.003*"first" + 0.003*"story" + 0.003*"ferrell" + 0.003*"character"'), (2, '0.007*"batman" + 0.004*"would" + 0.004*"action" + 0.003*"played" + 0.003*"years" + 0.003*"loretta" + 0.003*"character" + 0.003*"reeves" + 0.003*"story" + 0.003*"world"'), (3, '0.005*"action" + 0.004*"world" + 0.004*"could" + 0.003*"story" + 0.003*"years" + 0.003*"women" + 0.003*"first" + 0.003*"never" + 0.003*"people" + 0.003*"still"'), (4, '0.006*"girls" + 0.005*"school" + 0.004*"batman" + 0.003*"fresh" + 0.003*"regina" + 0.003*"janis" + 0.003*"steve" + 0.003*"world" + 0.003*"comes" + 0.002*"first"'), (5, '0.004*"action" + 0.004*"fallen" + 0.003*"french" + 0.003*"really" + 0.003*"barbie" + 0.003*"people" + 0.003*"characters" + 0.003*"something" + 0.003*"makes" + 0.002*"story"'), (6, '0.007*"house" + 0.005*"horror" + 0.004*"grace" + 0.004*"children" + 0.004*"story" + 0.004*"christine" + 0.004*"would" + 0.003*"raimi" + 0.003*"first" + 0.003*"young"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"banning" + 0.004*"president" + 0.004*"character" + 0.003*"played" + 0.003*"house" + 0.003*"characters" + 0.003*"director" + 0.003*"first" + 0.002*"could" + 0.002*"story"'), (1, '0.009*"holmes" + 0.008*"barbie" + 0.006*"watson" + 0.005*"would" + 0.004*"ferrell" + 0.004*"comedy" + 0.004*"reilly" + 0.004*"sherlock" + 0.004*"first" + 0.003*"could"'), (2, '0.005*"batman" + 0.005*"action" + 0.004*"would" + 0.003*"years" + 0.003*"world" + 0.003*"story" + 0.003*"played" + 0.003*"character" + 0.003*"oblivion" + 0.003*"still"'), (3, '0.005*"action" + 0.004*"world" + 0.004*"years" + 0.003*"story" + 0.003*"first" + 0.003*"women" + 0.003*"could" + 0.003*"never" + 0.003*"people" + 0.002*"still"'), (4, '0.003*"fresh" + 0.003*"could" + 0.003*"place" + 0.002*"friend" + 0.002*"paris" + 0.002*"people" + 0.002*"women" + 0.002*"still" + 0.002*"going" + 0.002*"first"'), (5, '0.005*"action" + 0.004*"fallen" + 0.004*"really" + 0.004*"characters" + 0.003*"people" + 0.003*"would" + 0.003*"inception" + 0.003*"barbie" + 0.003*"still" + 0.003*"story"'), (6, '0.006*"horror" + 0.006*"house" + 0.005*"raimi" + 0.005*"christine" + 0.004*"would" + 0.003*"story" + 0.003*"first" + 0.003*"grace" + 0.003*"woman" + 0.003*"young"'), (7, '0.005*"school" + 0.005*"girls" + 0.003*"fresh" + 0.003*"would" + 0.003*"steve" + 0.003*"first" + 0.003*"batman" + 0.003*"world" + 0.002*"director" + 0.002*"horror"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"banning" + 0.004*"president" + 0.004*"character" + 0.003*"house" + 0.003*"first" + 0.003*"another" + 0.003*"could" + 0.003*"played" + 0.003*"fallen" + 0.003*"franchise"'), (1, '0.008*"barbie" + 0.007*"holmes" + 0.005*"jason" + 0.005*"watson" + 0.004*"would" + 0.004*"comedy" + 0.003*"first" + 0.003*"story" + 0.003*"ferrell" + 0.003*"could"'), (2, '0.005*"action" + 0.004*"played" + 0.004*"batman" + 0.004*"would" + 0.003*"loretta" + 0.003*"world" + 0.003*"story" + 0.003*"holmes" + 0.003*"great" + 0.003*"earth"'), (3, '0.005*"action" + 0.004*"world" + 0.004*"people" + 0.003*"women" + 0.003*"years" + 0.003*"could" + 0.003*"story" + 0.003*"still" + 0.003*"barbie" + 0.003*"harper"'), (4, '0.005*"fresh" + 0.004*"batman" + 0.004*"steve" + 0.003*"could" + 0.003*"dating" + 0.003*"place" + 0.003*"women" + 0.003*"friend" + 0.003*"first" + 0.002*"paris"'), (5, '0.005*"fallen" + 0.005*"characters" + 0.004*"action" + 0.004*"people" + 0.004*"barbie" + 0.003*"still" + 0.003*"really" + 0.003*"something" + 0.003*"president" + 0.003*"angel"'), (6, '0.007*"horror" + 0.006*"house" + 0.006*"raimi" + 0.006*"christine" + 0.004*"story" + 0.004*"first" + 0.004*"grace" + 0.004*"kidman" + 0.003*"children" + 0.003*"young"'), (7, '0.003*"horror" + 0.003*"fresh" + 0.003*"would" + 0.003*"steve" + 0.003*"director" + 0.002*"story" + 0.002*"world" + 0.002*"school" + 0.002*"funny" + 0.002*"something"'), (8, '0.005*"school" + 0.004*"years" + 0.004*"girls" + 0.004*"would" + 0.003*"action" + 0.003*"never" + 0.003*"genre" + 0.003*"batman" + 0.003*"first" + 0.003*"people"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"nolan" + 0.004*"house" + 0.004*"characters" + 0.003*"character" + 0.003*"inception" + 0.003*"grace" + 0.003*"could" + 0.003*"children" + 0.003*"never" + 0.003*"another"'), (1, '0.010*"holmes" + 0.009*"barbie" + 0.006*"watson" + 0.005*"would" + 0.004*"ferrell" + 0.004*"comedy" + 0.004*"sherlock" + 0.004*"reilly" + 0.004*"jason" + 0.003*"could"'), (2, '0.005*"action" + 0.004*"batman" + 0.004*"story" + 0.004*"earth" + 0.004*"oblivion" + 0.003*"would" + 0.003*"world" + 0.003*"cruise" + 0.003*"holmes" + 0.003*"great"'), (3, '0.004*"action" + 0.004*"world" + 0.004*"women" + 0.003*"story" + 0.003*"years" + 0.003*"first" + 0.003*"james" + 0.003*"craig" + 0.003*"horror" + 0.003*"harper"'), (4, '0.004*"batman" + 0.004*"fresh" + 0.003*"comes" + 0.003*"parents" + 0.003*"friend" + 0.003*"paris" + 0.003*"women" + 0.003*"people" + 0.003*"girls" + 0.003*"still"'), (5, '0.005*"barbie" + 0.004*"people" + 0.004*"characters" + 0.004*"really" + 0.004*"planet" + 0.003*"black" + 0.003*"pitch" + 0.003*"still" + 0.003*"story" + 0.002*"something"'), (6, '0.007*"house" + 0.005*"horror" + 0.004*"raimi" + 0.004*"first" + 0.003*"story" + 0.003*"children" + 0.003*"grace" + 0.003*"james" + 0.003*"conjuring" + 0.003*"young"'), (7, '0.004*"girls" + 0.004*"fresh" + 0.004*"school" + 0.004*"steve" + 0.003*"would" + 0.003*"really" + 0.003*"horror" + 0.003*"first" + 0.003*"story" + 0.002*"comedy"'), (8, '0.005*"school" + 0.004*"would" + 0.004*"years" + 0.003*"action" + 0.003*"genre" + 0.003*"first" + 0.003*"never" + 0.003*"story" + 0.003*"people" + 0.003*"almost"'), (9, '0.008*"fallen" + 0.008*"president" + 0.007*"banning" + 0.006*"angel" + 0.004*"loretta" + 0.004*"action" + 0.003*"butler" + 0.003*"played" + 0.003*"director" + 0.003*"olympus"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.004*"house" + 0.003*"president" + 0.003*"first" + 0.003*"another" + 0.003*"banning" + 0.003*"could" + 0.003*"grace" + 0.003*"nolan" + 0.003*"never" + 0.002*"character"'), (1, '0.009*"barbie" + 0.005*"jason" + 0.005*"holmes" + 0.004*"comedy" + 0.004*"watson" + 0.003*"would" + 0.003*"first" + 0.003*"could" + 0.003*"world" + 0.003*"story"'), (2, '0.006*"action" + 0.004*"oblivion" + 0.004*"earth" + 0.004*"would" + 0.004*"story" + 0.004*"neeson" + 0.003*"cruise" + 0.003*"great" + 0.003*"world" + 0.003*"planet"'), (3, '0.004*"action" + 0.004*"world" + 0.004*"women" + 0.004*"craig" + 0.003*"james" + 0.003*"story" + 0.003*"people" + 0.003*"first" + 0.003*"years" + 0.003*"horror"'), (4, '0.005*"batman" + 0.003*"paris" + 0.003*"friend" + 0.003*"first" + 0.003*"girls" + 0.003*"mitchell" + 0.003*"reeves" + 0.003*"women" + 0.003*"cruise" + 0.002*"could"'), (5, '0.005*"action" + 0.005*"really" + 0.005*"barbie" + 0.005*"characters" + 0.004*"people" + 0.004*"would" + 0.003*"still" + 0.003*"makes" + 0.003*"something" + 0.003*"going"'), (6, '0.007*"house" + 0.004*"horror" + 0.004*"story" + 0.003*"christine" + 0.003*"first" + 0.003*"grace" + 0.003*"james" + 0.003*"young" + 0.003*"kidman" + 0.003*"would"'), (7, '0.004*"would" + 0.003*"girls" + 0.003*"really" + 0.003*"conjuring" + 0.003*"horror" + 0.003*"comedy" + 0.003*"school" + 0.003*"director" + 0.002*"batman" + 0.002*"never"'), (8, '0.005*"school" + 0.004*"would" + 0.003*"first" + 0.003*"inception" + 0.003*"action" + 0.003*"genre" + 0.003*"girls" + 0.003*"story" + 0.003*"still" + 0.003*"could"'), (9, '0.008*"fallen" + 0.005*"angel" + 0.005*"loretta" + 0.005*"president" + 0.004*"banning" + 0.004*"bullock" + 0.003*"could" + 0.003*"still" + 0.003*"people" + 0.003*"action"'), (10, '0.009*"batman" + 0.006*"nolan" + 0.004*"characters" + 0.004*"reeves" + 0.003*"could" + 0.003*"story" + 0.003*"years" + 0.003*"first" + 0.003*"school" + 0.003*"dream"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.005*"house" + 0.004*"character" + 0.004*"banning" + 0.004*"fallen" + 0.003*"angel" + 0.003*"children" + 0.003*"conjuring" + 0.003*"story" + 0.003*"director" + 0.003*"grace"'), (1, '0.012*"barbie" + 0.006*"would" + 0.004*"jason" + 0.004*"first" + 0.003*"comedy" + 0.003*"daughter" + 0.003*"story" + 0.003*"holmes" + 0.003*"still" + 0.003*"never"'), (2, '0.007*"action" + 0.004*"holmes" + 0.004*"world" + 0.003*"fallen" + 0.003*"batman" + 0.003*"cruise" + 0.003*"great" + 0.003*"years" + 0.003*"story" + 0.003*"oblivion"'), (3, '0.005*"action" + 0.005*"world" + 0.005*"story" + 0.004*"women" + 0.004*"years" + 0.003*"harper" + 0.003*"would" + 0.003*"people" + 0.003*"first" + 0.003*"craig"'), (4, '0.004*"batman" + 0.003*"still" + 0.003*"paris" + 0.003*"friend" + 0.003*"fresh" + 0.003*"without" + 0.003*"going" + 0.003*"action" + 0.003*"girls" + 0.003*"parents"'), (5, '0.005*"really" + 0.004*"something" + 0.004*"makes" + 0.004*"would" + 0.004*"story" + 0.004*"characters" + 0.003*"action" + 0.003*"planet" + 0.003*"people" + 0.002*"black"'), (6, '0.004*"first" + 0.004*"story" + 0.004*"would" + 0.003*"characters" + 0.003*"character" + 0.003*"loretta" + 0.003*"grandpa" + 0.003*"chapter" + 0.003*"riddick" + 0.003*"emotions"'), (7, '0.005*"girls" + 0.004*"school" + 0.003*"french" + 0.003*"comedy" + 0.003*"steve" + 0.003*"great" + 0.003*"really" + 0.003*"never" + 0.003*"daniel" + 0.003*"story"'), (8, '0.005*"school" + 0.004*"first" + 0.003*"genre" + 0.003*"would" + 0.003*"world" + 0.003*"action" + 0.003*"girls" + 0.003*"spielberg" + 0.003*"almost" + 0.003*"american"'), (9, '0.006*"president" + 0.006*"fallen" + 0.005*"loretta" + 0.005*"banning" + 0.004*"bullock" + 0.004*"could" + 0.003*"angel" + 0.003*"still" + 0.003*"played" + 0.002*"action"'), (10, '0.007*"batman" + 0.006*"nolan" + 0.005*"could" + 0.003*"years" + 0.003*"characters" + 0.003*"school" + 0.003*"reeves" + 0.003*"played" + 0.003*"character" + 0.003*"never"'), (11, '0.007*"house" + 0.006*"horror" + 0.004*"would" + 
0.004*"inception" + 0.004*"kidman" + 0.004*"grace" + 0.004*"raimi" + 0.003*"could" + 0.003*"woman" + 0.003*"years"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.005*"house" + 0.003*"character" + 0.003*"children" + 0.003*"story" + 0.003*"conjuring" + 0.003*"grace" + 0.003*"characters" + 0.003*"franchise" + 0.003*"horror" + 0.003*"never"'), (1, '0.011*"barbie" + 0.006*"jason" + 0.005*"would" + 0.004*"comedy" + 0.004*"first" + 0.003*"grandpa" + 0.003*"world" + 0.003*"holmes" + 0.003*"dirty" + 0.003*"watson"'), (2, '0.007*"action" + 0.004*"great" + 0.004*"world" + 0.003*"batman" + 0.003*"fallen" + 0.003*"oblivion" + 0.003*"first" + 0.003*"story" + 0.002*"neeson" + 0.002*"french"'), (3, '0.006*"action" + 0.005*"world" + 0.005*"women" + 0.004*"years" + 0.004*"story" + 0.004*"first" + 0.003*"barbie" + 0.003*"harper" + 0.003*"james" + 0.003*"still"'), (4, '0.006*"batman" + 0.004*"paris" + 0.003*"girls" + 0.003*"parents" + 0.003*"friend" + 0.003*"going" + 0.003*"reeves" + 0.003*"mitchell" + 0.003*"action" + 0.003*"without"'), (5, '0.005*"would" + 0.004*"really" + 0.004*"barbie" + 0.004*"story" + 0.004*"characters" + 0.004*"makes" + 0.003*"going" + 0.003*"people" + 0.003*"girls" + 0.003*"something"'), (6, '0.004*"story" + 0.004*"house" + 0.003*"first" + 0.003*"conjuring" + 0.003*"character" + 0.003*"lorraine" + 0.003*"plays" + 0.003*"riddick" + 0.003*"janet" + 0.003*"around"'), (7, '0.006*"girls" + 0.005*"school" + 0.004*"batman" + 0.004*"steve" + 0.003*"really" + 0.003*"janis" + 0.003*"would" + 0.003*"lohan" + 0.003*"never" + 0.003*"comedy"'), (8, '0.004*"action" + 0.004*"genre" + 0.004*"minority" + 0.004*"report" + 0.004*"spielberg" + 0.004*"would" + 0.004*"years" + 0.003*"never" + 0.003*"story" + 0.003*"school"'), (9, '0.010*"fallen" + 0.009*"president" + 0.009*"banning" + 0.007*"angel" + 0.005*"action" + 0.004*"loretta" + 0.004*"butler" + 0.003*"freeman" + 0.003*"could" + 0.003*"olympus"'), (10, '0.007*"batman" + 0.006*"nolan" + 0.005*"could" + 0.004*"characters" + 0.004*"character" + 0.004*"school" + 0.003*"first" + 0.003*"years" + 0.003*"since" + 0.003*"dream"'), (11, '0.005*"would" + 0.005*"holmes" + 0.005*"house" + 
0.004*"grace" + 0.004*"inception" + 0.004*"horror" + 0.003*"woman" + 0.003*"fresh" + 0.003*"watson" + 0.003*"still"'), (12, '0.005*"horror" + 0.004*"raimi" + 0.004*"holmes" + 0.004*"watson" + 0.003*"chapter" + 0.003*"young" + 0.003*"christine" + 0.003*"really" + 0.003*"ferrell" + 0.003*"conjuring"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.006*"house" + 0.004*"children" + 0.004*"grace" + 0.003*"conjuring" + 0.003*"story" + 0.003*"really" + 0.003*"franchise" + 0.003*"horror" + 0.003*"another" + 0.003*"around"'), (1, '0.015*"barbie" + 0.004*"story" + 0.004*"robbie" + 0.003*"would" + 0.003*"first" + 0.003*"world" + 0.003*"holmes" + 0.003*"another" + 0.003*"gosling" + 0.003*"watson"'), (2, '0.007*"action" + 0.004*"great" + 0.004*"world" + 0.004*"fallen" + 0.003*"batman" + 0.003*"first" + 0.002*"years" + 0.002*"attempts" + 0.002*"neeson" + 0.002*"though"'), (3, '0.006*"women" + 0.005*"world" + 0.005*"action" + 0.004*"people" + 0.003*"barbie" + 0.003*"story" + 0.003*"years" + 0.003*"actually" + 0.003*"still" + 0.003*"would"'), (4, '0.007*"school" + 0.006*"batman" + 0.004*"regina" + 0.004*"janis" + 0.004*"girls" + 0.003*"still" + 0.003*"plastics" + 0.003*"parents" + 0.003*"cliques" + 0.003*"friend"'), (5, '0.004*"action" + 0.004*"would" + 0.004*"story" + 0.004*"makes" + 0.003*"barbie" + 0.003*"characters" + 0.003*"people" + 0.003*"still" + 0.003*"something" + 0.003*"planet"'), (6, '0.006*"story" + 0.005*"chapter" + 0.004*"horror" + 0.004*"conjuring" + 0.004*"pennywise" + 0.004*"first" + 0.004*"house" + 0.003*"years" + 0.003*"james" + 0.003*"characters"'), (7, '0.005*"girls" + 0.004*"steve" + 0.004*"fresh" + 0.003*"would" + 0.003*"story" + 0.003*"comedy" + 0.003*"dating" + 0.003*"daughter" + 0.003*"could" + 0.003*"batman"'), (8, '0.005*"school" + 0.004*"story" + 0.004*"french" + 0.004*"years" + 0.004*"would" + 0.004*"genre" + 0.004*"action" + 0.004*"minority" + 0.004*"report" + 0.003*"spielberg"'), (9, '0.011*"fallen" + 0.010*"president" + 0.010*"banning" + 0.008*"angel" + 0.005*"loretta" + 0.004*"butler" + 0.004*"action" + 0.004*"service" + 0.004*"secret" + 0.004*"could"'), (10, '0.008*"batman" + 0.007*"nolan" + 0.004*"characters" + 0.004*"could" + 0.004*"years" + 0.004*"dream" + 0.003*"reeves" + 0.003*"story" + 0.003*"character" + 0.003*"since"'), (11, '0.005*"house" + 0.004*"inception" + 
0.004*"holmes" + 0.004*"would" + 0.004*"watson" + 0.004*"every" + 0.004*"grace" + 0.003*"horror" + 0.003*"could" + 0.003*"woman"'), (12, '0.006*"christine" + 0.006*"holmes" + 0.005*"never" + 0.004*"watson" + 0.004*"really" + 0.003*"conjuring" + 0.003*"horror" + 0.003*"awful" + 0.003*"ferrell" + 0.003*"funny"'), (13, '0.008*"jason" + 0.006*"grandpa" + 0.005*"character" + 0.005*"would" + 0.005*"dirty" + 0.004*"robert" + 0.004*"craig" + 0.004*"efron" + 0.003*"girls" + 0.003*"first"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.006*"house" + 0.005*"banning" + 0.005*"president" + 0.004*"first" + 0.004*"children" + 0.004*"grace" + 0.003*"franchise" + 0.003*"character" + 0.003*"around" + 0.003*"story"'), (1, '0.009*"barbie" + 0.004*"first" + 0.004*"comedy" + 0.003*"story" + 0.003*"would" + 0.003*"years" + 0.003*"minutes" + 0.003*"another" + 0.003*"could" + 0.003*"whose"'), (2, '0.008*"action" + 0.005*"great" + 0.005*"batman" + 0.004*"world" + 0.004*"story" + 0.004*"reeves" + 0.004*"cruise" + 0.003*"fallen" + 0.003*"earth" + 0.003*"human"'), (3, '0.005*"action" + 0.004*"women" + 0.004*"world" + 0.003*"black" + 0.003*"people" + 0.003*"harper" + 0.003*"story" + 0.003*"chapter" + 0.003*"horror" + 0.003*"really"'), (4, '0.005*"school" + 0.005*"girls" + 0.003*"going" + 0.003*"comes" + 0.003*"spielberg" + 0.003*"friend" + 0.003*"still" + 0.003*"parents" + 0.003*"times" + 0.003*"place"'), (5, '0.005*"would" + 0.004*"really" + 0.004*"story" + 0.004*"makes" + 0.004*"characters" + 0.004*"people" + 0.003*"something" + 0.003*"planet" + 0.003*"action" + 0.003*"black"'), (6, '0.008*"conjuring" + 0.005*"horror" + 0.005*"lorraine" + 0.005*"house" + 0.004*"story" + 0.004*"characters" + 0.004*"young" + 0.003*"janet" + 0.003*"warrens" + 0.003*"first"'), (7, '0.004*"steve" + 0.004*"really" + 0.003*"batman" + 0.003*"fresh" + 0.003*"character" + 0.003*"dating" + 0.003*"every" + 0.003*"characters" + 0.003*"takes" + 0.002*"funny"'), (8, '0.005*"girls" + 0.005*"would" + 0.004*"action" + 0.004*"school" + 0.004*"american" + 0.004*"genre" + 0.004*"people" + 0.003*"french" + 0.003*"first" + 0.003*"years"'), (9, '0.009*"fallen" + 0.007*"president" + 0.006*"angel" + 0.006*"banning" + 0.005*"action" + 0.004*"loretta" + 0.004*"service" + 0.004*"secret" + 0.003*"butler" + 0.003*"could"'), (10, '0.008*"batman" + 0.006*"nolan" + 0.005*"school" + 0.004*"years" + 0.003*"characters" + 0.003*"could" + 0.003*"first" + 0.003*"reeves" + 0.003*"audiences" + 0.003*"never"'), (11, '0.005*"house" + 0.005*"holmes" + 
0.005*"inception" + 0.004*"grace" + 0.004*"would" + 0.004*"woman" + 0.003*"kidman" + 0.003*"could" + 0.003*"still" + 0.003*"years"'), (12, '0.017*"holmes" + 0.014*"watson" + 0.009*"ferrell" + 0.009*"reilly" + 0.006*"sherlock" + 0.005*"brothers" + 0.005*"never" + 0.005*"raimi" + 0.005*"christine" + 0.004*"director"'), (13, '0.007*"craig" + 0.007*"character" + 0.005*"barbie" + 0.005*"would" + 0.004*"james" + 0.003*"jason" + 0.003*"first" + 0.003*"robert" + 0.003*"little" + 0.003*"batman"'), (14, '0.007*"world" + 0.005*"barbie" + 0.005*"would" + 0.005*"story" + 0.004*"never" + 0.004*"girls" + 0.004*"first" + 0.003*"years" + 0.003*"anderton" + 0.003*"fresh"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.006*"fallen" + 0.005*"house" + 0.005*"conjuring" + 0.005*"banning" + 0.005*"angel" + 0.004*"director" + 0.004*"president" + 0.004*"story" + 0.004*"franchise" + 0.004*"london"'), (1, '0.008*"barbie" + 0.008*"jason" + 0.005*"comedy" + 0.005*"grandpa" + 0.004*"dirty" + 0.004*"holmes" + 0.004*"first" + 0.004*"world" + 0.004*"watson" + 0.003*"another"'), (2, '0.007*"action" + 0.004*"world" + 0.004*"great" + 0.004*"earth" + 0.004*"story" + 0.003*"fallen" + 0.003*"oblivion" + 0.003*"cruise" + 0.003*"planet" + 0.003*"batman"'), (3, '0.006*"action" + 0.005*"world" + 0.005*"women" + 0.004*"harper" + 0.004*"years" + 0.004*"story" + 0.003*"dream" + 0.003*"people" + 0.003*"would" + 0.003*"still"'), (4, '0.004*"paris" + 0.003*"girls" + 0.003*"friend" + 0.003*"women" + 0.003*"actually" + 0.003*"could" + 0.003*"going" + 0.003*"action" + 0.003*"cruise" + 0.003*"oblivion"'), (5, '0.005*"makes" + 0.004*"planet" + 0.004*"characters" + 0.003*"really" + 0.003*"black" + 0.003*"people" + 0.003*"world" + 0.003*"pitch" + 0.003*"character" + 0.003*"horror"'), (6, '0.007*"house" + 0.007*"conjuring" + 0.004*"lorraine" + 0.004*"story" + 0.004*"janet" + 0.004*"children" + 0.004*"riddick" + 0.004*"character" + 0.004*"around" + 0.003*"emotions"'), (7, '0.004*"girls" + 0.004*"school" + 0.004*"really" + 0.003*"every" + 0.003*"characters" + 0.003*"people" + 0.003*"steve" + 0.003*"character" + 0.003*"played" + 0.003*"story"'), (8, '0.006*"french" + 0.005*"genre" + 0.005*"action" + 0.004*"american" + 0.004*"would" + 0.004*"years" + 0.004*"better" + 0.004*"school" + 0.003*"people" + 0.003*"really"'), (9, '0.007*"batman" + 0.007*"fallen" + 0.006*"president" + 0.004*"angel" + 0.004*"still" + 0.004*"could" + 0.004*"banning" + 0.003*"played" + 0.003*"school" + 0.003*"action"'), (10, '0.005*"could" + 0.005*"school" + 0.004*"nolan" + 0.003*"first" + 0.003*"years" + 0.003*"character" + 0.003*"played" + 0.003*"batman" + 0.003*"feels" + 0.003*"never"'), (11, '0.005*"inception" + 0.005*"house" + 
0.005*"would" + 0.004*"grace" + 0.004*"woman" + 0.004*"horror" + 0.004*"barbie" + 0.003*"people" + 0.003*"takes" + 0.003*"still"'), (12, '0.017*"christine" + 0.013*"raimi" + 0.009*"horror" + 0.008*"lohman" + 0.005*"gypsy" + 0.005*"raver" + 0.005*"ganush" + 0.004*"funny" + 0.004*"curse" + 0.004*"alison"'), (13, '0.005*"character" + 0.005*"craig" + 0.005*"would" + 0.004*"girls" + 0.004*"daughter" + 0.004*"first" + 0.003*"james" + 0.003*"barbie" + 0.003*"batman" + 0.003*"raimi"'), (14, '0.006*"would" + 0.006*"story" + 0.006*"world" + 0.005*"never" + 0.004*"anderton" + 0.004*"comedy" + 0.004*"first" + 0.004*"fresh" + 0.003*"dream" + 0.003*"report"'), (15, '0.010*"holmes" + 0.006*"watson" + 0.005*"chapter" + 0.004*"first" + 0.004*"years" + 0.004*"sherlock" + 0.004*"loretta" + 0.004*"batman" + 0.004*"ferrell" + 0.004*"whose"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.005*"story" + 0.004*"conjuring" + 0.004*"house" + 0.003*"franchise" + 0.003*"thing" + 0.003*"first" + 0.003*"director" + 0.003*"screen" + 0.003*"character" + 0.003*"might"'), (1, '0.014*"barbie" + 0.009*"jason" + 0.005*"grandpa" + 0.004*"comedy" + 0.004*"dirty" + 0.003*"story" + 0.003*"robbie" + 0.003*"world" + 0.003*"grandson" + 0.003*"first"'), (2, '0.006*"action" + 0.005*"batman" + 0.004*"great" + 0.004*"story" + 0.003*"neeson" + 0.003*"would" + 0.003*"oblivion" + 0.003*"played" + 0.003*"world" + 0.002*"french"'), (3, '0.007*"action" + 0.006*"world" + 0.005*"women" + 0.004*"barbie" + 0.004*"story" + 0.004*"harper" + 0.004*"black" + 0.003*"people" + 0.003*"dream" + 0.003*"characters"'), (4, '0.007*"batman" + 0.004*"paris" + 0.003*"friend" + 0.003*"girls" + 0.003*"cruise" + 0.003*"reeves" + 0.003*"mitchell" + 0.003*"parents" + 0.003*"young" + 0.003*"within"'), (5, '0.006*"really" + 0.005*"action" + 0.005*"planet" + 0.004*"characters" + 0.004*"makes" + 0.003*"horror" + 0.003*"people" + 0.003*"start" + 0.003*"french" + 0.003*"still"'), (6, '0.007*"house" + 0.006*"conjuring" + 0.005*"lorraine" + 0.004*"horror" + 0.004*"story" + 0.004*"around" + 0.004*"children" + 0.004*"first" + 0.004*"riddick" + 0.003*"janet"'), (7, '0.007*"school" + 0.006*"girls" + 0.004*"steve" + 0.003*"lohan" + 0.003*"janis" + 0.003*"first" + 0.003*"comedy" + 0.003*"never" + 0.003*"daughter" + 0.003*"regina"'), (8, '0.006*"school" + 0.004*"girls" + 0.004*"first" + 0.004*"would" + 0.004*"genre" + 0.003*"story" + 0.003*"almost" + 0.003*"spielberg" + 0.003*"romance" + 0.003*"sense"'), (9, '0.017*"fallen" + 0.013*"president" + 0.012*"banning" + 0.011*"angel" + 0.007*"action" + 0.007*"butler" + 0.006*"olympus" + 0.005*"service" + 0.005*"secret" + 0.005*"freeman"'), (10, '0.007*"batman" + 0.006*"nolan" + 0.006*"could" + 0.005*"years" + 0.003*"world" + 0.003*"great" + 0.003*"action" + 0.003*"better" + 0.003*"characters" + 0.003*"planet"'), (11, '0.006*"would" + 0.005*"woman" + 0.005*"inception" 
+ 0.004*"house" + 0.004*"holmes" + 0.004*"every" + 0.004*"horror" + 0.004*"women" + 0.003*"still" + 0.003*"world"'), (12, '0.011*"christine" + 0.007*"raimi" + 0.005*"lohman" + 0.005*"watson" + 0.005*"holmes" + 0.004*"never" + 0.004*"really" + 0.004*"something" + 0.004*"years" + 0.004*"raver"'), (13, '0.006*"batman" + 0.006*"craig" + 0.005*"character" + 0.004*"james" + 0.004*"action" + 0.003*"daughter" + 0.003*"going" + 0.003*"would" + 0.003*"daniel" + 0.003*"girls"'), (14, '0.006*"would" + 0.005*"story" + 0.005*"fresh" + 0.005*"world" + 0.004*"never" + 0.004*"edgarjones" + 0.004*"anderton" + 0.004*"dream" + 0.003*"knows" + 0.003*"years"'), (15, '0.006*"holmes" + 0.005*"chapter" + 0.005*"first" + 0.005*"loretta" + 0.004*"watson" + 0.004*"raimi" + 0.004*"horror" + 0.004*"would" + 0.004*"years" + 0.003*"pennywise"'), (16, '0.008*"character" + 0.007*"characters" + 0.004*"monsters" + 0.004*"people" + 0.002*"conjuring" + 0.002*"genre" + 0.002*"riddick" + 0.002*"french" + 0.002*"grandfather" + 0.002*"jason"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.008*"house" + 0.006*"children" + 0.006*"grace" + 0.004*"story" + 0.004*"woman" + 0.003*"franchise" + 0.003*"horror" + 0.003*"conjuring" + 0.003*"servants" + 0.003*"might"'), (1, '0.011*"barbie" + 0.003*"first" + 0.003*"robbie" + 0.003*"comedy" + 0.003*"world" + 0.003*"holmes" + 0.003*"never" + 0.003*"performances" + 0.003*"every" + 0.003*"watson"'), (2, '0.006*"action" + 0.004*"would" + 0.004*"story" + 0.003*"batman" + 0.003*"neeson" + 0.003*"human" + 0.003*"planet" + 0.003*"cruise" + 0.003*"years" + 0.003*"attempts"'), (3, '0.005*"action" + 0.005*"world" + 0.005*"women" + 0.004*"story" + 0.003*"people" + 0.003*"harper" + 0.003*"barbie" + 0.003*"craig" + 0.003*"jokes" + 0.003*"years"'), (4, '0.012*"girls" + 0.011*"school" + 0.008*"fresh" + 0.008*"steve" + 0.007*"regina" + 0.007*"dating" + 0.005*"parents" + 0.005*"friend" + 0.005*"cliques" + 0.005*"janis"'), (5, '0.005*"fallen" + 0.005*"action" + 0.005*"really" + 0.004*"planet" + 0.004*"black" + 0.003*"makes" + 0.003*"pitch" + 0.003*"characters" + 0.003*"going" + 0.003*"people"'), (6, '0.006*"story" + 0.005*"would" + 0.005*"riddick" + 0.005*"emotions" + 0.004*"house" + 0.004*"anderton" + 0.004*"young" + 0.003*"character" + 0.003*"around" + 0.003*"however"'), (7, '0.004*"really" + 0.004*"something" + 0.004*"future" + 0.004*"character" + 0.003*"preston" + 0.003*"takes" + 0.003*"characters" + 0.003*"world" + 0.003*"equilibrium" + 0.003*"either"'), (8, '0.006*"school" + 0.005*"people" + 0.004*"would" + 0.004*"action" + 0.004*"girls" + 0.004*"american" + 0.004*"genre" + 0.004*"first" + 0.003*"really" + 0.003*"sense"'), (9, '0.012*"president" + 0.012*"banning" + 0.010*"fallen" + 0.008*"angel" + 0.005*"butler" + 0.005*"action" + 0.005*"service" + 0.004*"secret" + 0.004*"played" + 0.004*"still"'), (10, '0.008*"batman" + 0.006*"nolan" + 0.005*"years" + 0.004*"school" + 0.003*"could" + 0.003*"planet" + 0.003*"anything" + 0.003*"audiences" + 0.003*"comedy" + 0.003*"never"'), (11, '0.006*"inception" + 0.006*"house" + 
0.006*"would" + 0.005*"grace" + 0.005*"woman" + 0.004*"jason" + 0.004*"kidman" + 0.004*"nolan" + 0.004*"horror" + 0.004*"every"'), (12, '0.005*"really" + 0.005*"holmes" + 0.005*"watson" + 0.004*"christine" + 0.004*"grandpa" + 0.004*"ferrell" + 0.003*"years" + 0.003*"never" + 0.003*"brothers" + 0.003*"reilly"'), (13, '0.005*"craig" + 0.004*"character" + 0.004*"would" + 0.004*"never" + 0.004*"batman" + 0.004*"daughter" + 0.003*"first" + 0.003*"james" + 0.003*"girls" + 0.003*"however"'), (14, '0.005*"story" + 0.005*"comedy" + 0.005*"years" + 0.005*"never" + 0.004*"first" + 0.004*"dream" + 0.004*"world" + 0.004*"girls" + 0.004*"would" + 0.003*"knows"'), (15, '0.009*"holmes" + 0.006*"watson" + 0.006*"chapter" + 0.005*"loretta" + 0.004*"first" + 0.004*"horror" + 0.004*"raimi" + 0.004*"ferrell" + 0.004*"would" + 0.003*"reilly"'), (16, '0.014*"conjuring" + 0.008*"lorraine" + 0.005*"horror" + 0.005*"house" + 0.005*"warrens" + 0.005*"characters" + 0.005*"hodgson" + 0.005*"james" + 0.004*"janet" + 0.004*"farmiga"'), (17, '0.007*"barbie" + 0.005*"action" + 0.005*"great" + 0.004*"batman" + 0.004*"world" + 0.004*"characters" + 0.003*"french" + 0.003*"story" + 0.003*"would" + 0.003*"nothing"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.009*"house" + 0.006*"grace" + 0.006*"children" + 0.004*"horror" + 0.003*"conjuring" + 0.003*"servants" + 0.003*"woman" + 0.003*"always" + 0.003*"story" + 0.003*"franchise"'), (1, '0.010*"barbie" + 0.007*"jason" + 0.005*"steve" + 0.005*"grandpa" + 0.005*"dirty" + 0.004*"comedy" + 0.004*"world" + 0.003*"robbie" + 0.003*"dating" + 0.003*"story"'), (2, '0.007*"action" + 0.005*"story" + 0.004*"batman" + 0.004*"great" + 0.004*"would" + 0.003*"years" + 0.003*"neeson" + 0.003*"world" + 0.003*"reeves" + 0.003*"holmes"'), (3, '0.009*"action" + 0.006*"world" + 0.006*"women" + 0.005*"harper" + 0.004*"black" + 0.004*"story" + 0.004*"people" + 0.004*"barbie" + 0.004*"cruise" + 0.003*"order"'), (4, '0.005*"paris" + 0.004*"friend" + 0.004*"girls" + 0.004*"batman" + 0.004*"cruise" + 0.003*"oblivion" + 0.003*"special" + 0.003*"daughter" + 0.003*"within" + 0.003*"family"'), (5, '0.005*"makes" + 0.004*"really" + 0.004*"world" + 0.004*"characters" + 0.004*"black" + 0.004*"character" + 0.003*"almost" + 0.003*"pitch" + 0.003*"edgarjones" + 0.003*"development"'), (6, '0.005*"story" + 0.005*"would" + 0.005*"riddick" + 0.004*"emotions" + 0.004*"black" + 0.004*"pitch" + 0.004*"anderton" + 0.004*"however" + 0.004*"loretta" + 0.004*"character"'), (7, '0.008*"school" + 0.008*"girls" + 0.004*"janis" + 0.004*"lohan" + 0.004*"regina" + 0.003*"future" + 0.003*"every" + 0.003*"plastics" + 0.003*"comes" + 0.003*"something"'), (8, '0.005*"genre" + 0.005*"action" + 0.004*"would" + 0.004*"american" + 0.004*"french" + 0.004*"school" + 0.004*"really" + 0.003*"better" + 0.003*"romance" + 0.003*"story"'), (9, '0.016*"fallen" + 0.013*"president" + 0.013*"banning" + 0.010*"angel" + 0.007*"action" + 0.006*"butler" + 0.006*"olympus" + 0.005*"service" + 0.005*"secret" + 0.005*"freeman"'), (10, '0.007*"school" + 0.006*"nolan" + 0.006*"batman" + 0.006*"could" + 0.004*"years" + 0.004*"around" + 0.004*"never" + 0.004*"feels" + 0.004*"played" + 0.003*"screen"'), (11, '0.007*"inception" + 0.005*"holmes" + 
0.005*"fresh" + 0.004*"steve" + 0.004*"would" + 0.004*"still" + 0.004*"nolan" + 0.004*"could" + 0.004*"people" + 0.004*"think"'), (12, '0.009*"raimi" + 0.007*"christine" + 0.006*"horror" + 0.004*"really" + 0.004*"never" + 0.004*"gypsy" + 0.003*"young" + 0.003*"funny" + 0.003*"watson" + 0.003*"director"'), (13, '0.005*"would" + 0.005*"daughter" + 0.004*"character" + 0.004*"batman" + 0.004*"first" + 0.003*"craig" + 0.003*"still" + 0.003*"robert" + 0.003*"girls" + 0.003*"little"'), (14, '0.005*"dream" + 0.005*"first" + 0.005*"never" + 0.005*"knows" + 0.004*"comedy" + 0.004*"years" + 0.004*"world" + 0.004*"since" + 0.004*"fresh" + 0.003*"instead"'), (15, '0.009*"holmes" + 0.007*"chapter" + 0.006*"watson" + 0.005*"loretta" + 0.005*"ferrell" + 0.004*"first" + 0.004*"reilly" + 0.004*"story" + 0.004*"would" + 0.004*"years"'), (16, '0.006*"character" + 0.005*"conjuring" + 0.005*"characters" + 0.005*"horror" + 0.004*"monsters" + 0.004*"lorraine" + 0.004*"woman" + 0.004*"young" + 0.003*"instead" + 0.003*"jason"'), (17, '0.010*"barbie" + 0.005*"would" + 0.005*"world" + 0.005*"characters" + 0.004*"never" + 0.004*"action" + 0.004*"nothing" + 0.004*"every" + 0.003*"story" + 0.003*"great"'), (18, '0.007*"house" + 0.005*"characters" + 0.005*"first" + 0.004*"planet" + 0.004*"really" + 0.004*"would" + 0.004*"character" + 0.004*"three" + 0.004*"james" + 0.003*"story"')]
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.005*"story" + 0.005*"conjuring" + 0.004*"house" + 0.004*"cruise" + 0.003*"around" + 0.003*"another" + 0.003*"character" + 0.003*"could" + 0.003*"nolan" + 0.003*"along"'), (1, '0.011*"barbie" + 0.005*"jason" + 0.004*"world" + 0.004*"robbie" + 0.004*"grace" + 0.004*"dirty" + 0.004*"grandpa" + 0.003*"mills" + 0.003*"house" + 0.003*"another"'), (2, '0.009*"action" + 0.005*"world" + 0.005*"great" + 0.004*"fallen" + 0.004*"batman" + 0.004*"story" + 0.003*"oblivion" + 0.003*"years" + 0.003*"neeson" + 0.003*"human"'), (3, '0.007*"action" + 0.005*"world" + 0.004*"women" + 0.004*"harper" + 0.004*"story" + 0.004*"barbie" + 0.003*"horror" + 0.003*"woman" + 0.003*"years" + 0.003*"dream"'), (4, '0.010*"paris" + 0.006*"oblivion" + 0.005*"girls" + 0.005*"cruise" + 0.004*"could" + 0.004*"daughter" + 0.004*"laugh" + 0.004*"excia" + 0.004*"staying" + 0.004*"start"'), (5, '0.005*"makes" + 0.005*"would" + 0.005*"really" + 0.004*"planet" + 0.004*"black" + 0.004*"story" + 0.003*"pitch" + 0.003*"characters" + 0.003*"world" + 0.003*"going"'), (6, '0.006*"house" + 0.005*"story" + 0.005*"would" + 0.004*"first" + 0.004*"conjuring" + 0.004*"young" + 0.004*"emotions" + 0.003*"anderton" + 0.003*"times" + 0.003*"horror"'), (7, '0.006*"school" + 0.004*"steve" + 0.004*"regina" + 0.004*"janis" + 0.004*"french" + 0.004*"fresh" + 0.003*"daughter" + 0.003*"girls" + 0.003*"going" + 0.003*"daniel"'), (8, '0.006*"people" + 0.005*"action" + 0.005*"american" + 0.004*"genre" + 0.004*"french" + 0.004*"would" + 0.004*"school" + 0.003*"romance" + 0.003*"world" + 0.003*"first"'), (9, '0.013*"president" + 0.012*"banning" + 0.010*"fallen" + 0.007*"angel" + 0.006*"butler" + 0.005*"service" + 0.005*"secret" + 0.005*"action" + 0.005*"loretta" + 0.004*"christine"'), (10, '0.006*"could" + 0.006*"nolan" + 0.005*"years" + 0.005*"school" + 0.004*"never" + 0.004*"batman" + 0.004*"feels" + 0.003*"without" + 0.003*"since" + 0.003*"played"'), (11, '0.006*"inception" + 0.005*"fresh" + 0.005*"would" + 0.005*"horror" + 
0.004*"still" + 0.004*"house" + 0.004*"women" + 0.004*"woman" + 0.003*"years" + 0.003*"jason"'), (12, '0.006*"christine" + 0.006*"something" + 0.005*"spielberg" + 0.004*"watson" + 0.004*"raimi" + 0.004*"years" + 0.004*"funny" + 0.004*"holmes" + 0.004*"director" + 0.004*"preston"'), (13, '0.004*"character" + 0.004*"would" + 0.004*"going" + 0.003*"dirty" + 0.003*"grandpa" + 0.003*"almost" + 0.003*"things" + 0.003*"batman" + 0.003*"barbie" + 0.003*"however"'), (14, '0.005*"girls" + 0.004*"never" + 0.004*"first" + 0.004*"fresh" + 0.004*"dream" + 0.004*"world" + 0.004*"years" + 0.004*"would" + 0.004*"knows" + 0.004*"horror"'), (15, '0.008*"holmes" + 0.006*"watson" + 0.006*"chapter" + 0.006*"first" + 0.005*"ferrell" + 0.005*"horror" + 0.005*"raimi" + 0.004*"reilly" + 0.004*"pennywise" + 0.004*"years"'), (16, '0.006*"conjuring" + 0.006*"monsters" + 0.005*"lorraine" + 0.005*"characters" + 0.004*"riddick" + 0.003*"james" + 0.003*"character" + 0.003*"paranormal" + 0.003*"alien" + 0.003*"interesting"'), (17, '0.009*"barbie" + 0.004*"world" + 0.004*"action" + 0.004*"would" + 0.004*"story" + 0.003*"batman" + 0.003*"first" + 0.003*"still" + 0.003*"people" + 0.003*"scifi"'), (18, '0.011*"batman" + 0.006*"characters" + 0.005*"really" + 0.005*"first" + 0.004*"funny" + 0.004*"house" + 0.003*"would" + 0.003*"crime" + 0.003*"three" + 0.003*"great"'), (19, '0.011*"holmes" + 0.006*"watson" + 0.005*"craig" + 0.005*"school" + 0.005*"never" + 0.005*"girls" + 0.005*"sherlock" + 0.005*"would" + 0.004*"people" + 0.004*"first"')]
{'2 topics 10 words': 0.2113357156231392, '3 topics 10 words': 0.2158911045631583, '4 topics 10 words': 0.2219581383064782, '5 topics 10 words': 0.2588639645792905, '6 topics 10 words': 0.25513967136377724, '7 topics 10 words': 0.27143261065232555, '8 topics 10 words': 0.2746518216139084, '9 topics 10 words': 0.28166207936137694, '10 topics 10 words': 0.29936122350641936, '11 topics 10 words': 0.2703692928518716, '12 topics 10 words': 0.2595912189558547, '13 topics 10 words': 0.2752380351672653, '14 topics 10 words': 0.29419963805034566, '15 topics 10 words': 0.3156047145205673, '16 topics 10 words': 0.298027508808128, '17 topics 10 words': 0.2930445213502198, '18 topics 10 words': 0.3194648883495045, '19 topics 10 words': 0.30123035429821077, '20 topics 10 words': 0.28249001152812364}
# Build an 18-topic LDA model (chosen from the coherence-score sweep above)
# and visualize document-to-document similarity in topic space.
number_of_topics = 18
words = 20

#model2,dictionary2,index2,doctermmatrix2=create_gensim_lda_model(processed_text,number_of_topics,words,titles)
model2, dictionary2, index2, doctermmatrix2 = create_gensim_lda_model(
    processed_text, number_of_topics, words)

# Query each document against the corpus index. Each iteration rebinds
# vec_bow2 / vec2 / sims2, so after the loop they hold the last document's values.
for doc in processed_text:
    vec_bow2 = dictionary2.doc2bow(doc)   # bag-of-words representation
    vec2 = model2[vec_bow2]               # convert the query to embedded (topic) space
    sims2 = index2[vec2]                  # similarity query against the corpus
    #print(list(enumerate(sims2)))

# Heat-map of the pairwise similarity matrix, with movie titles on both axes.
fig, ax = plt.subplots(figsize=(30, 10))
cax = ax.matshow(index2, interpolation='nearest')
ax.grid(True)
plt.xticks(range(len(processed_text)), titles, rotation=90);
plt.yticks(range(len(processed_text)), titles);
fig.colorbar(cax)
plt.show()
WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.008*"house" + 0.006*"children" + 0.006*"grace" + 0.004*"story" + 0.004*"woman" + 0.003*"franchise" + 0.003*"horror" + 0.003*"conjuring" + 0.003*"servants" + 0.003*"might" + 0.003*"always" + 0.003*"despite" + 0.003*"another" + 0.003*"course" + 0.003*"james" + 0.003*"young" + 0.003*"amenabar" + 0.003*"christine" + 0.003*"thing" + 0.002*"enough"'), (1, '0.011*"barbie" + 0.003*"first" + 0.003*"robbie" + 0.003*"comedy" + 0.003*"world" + 0.003*"holmes" + 0.003*"never" + 0.003*"performances" + 0.003*"every" + 0.003*"watson" + 0.003*"another" + 0.003*"story" + 0.003*"loretta" + 0.003*"character" + 0.003*"gosling" + 0.003*"awful" + 0.003*"point" + 0.002*"would" + 0.002*"funny" + 0.002*"watch"'), (2, '0.006*"action" + 0.004*"would" + 0.004*"story" + 0.003*"batman" + 0.003*"neeson" + 0.003*"human" + 0.003*"planet" + 0.003*"cruise" + 0.003*"years" + 0.003*"attempts" + 0.003*"fallen" + 0.003*"world" + 0.003*"character" + 0.003*"taken" + 0.003*"little" + 0.003*"think" + 0.002*"earth" + 0.002*"oblivion" + 0.002*"going" + 0.002*"something"'), (3, '0.005*"action" + 0.005*"world" + 0.005*"women" + 0.004*"story" + 0.003*"people" + 0.003*"harper" + 0.003*"barbie" + 0.003*"craig" + 0.003*"jokes" + 0.003*"years" + 0.003*"really" + 0.003*"could" + 0.003*"dream" + 0.002*"always" + 0.002*"still" + 0.002*"might" + 0.002*"trying" + 0.002*"cruise" + 0.002*"series" + 0.002*"riddick"'), (4, '0.012*"girls" + 0.011*"school" + 0.008*"fresh" + 0.008*"steve" + 0.007*"regina" + 0.007*"dating" + 0.005*"parents" + 0.005*"friend" + 0.005*"cliques" + 0.005*"janis" + 0.004*"paris" + 0.004*"right" + 0.004*"could" + 0.004*"plastics" + 0.004*"enough" + 0.004*"friends" + 0.004*"damian" + 0.004*"taste" + 0.003*"social" + 0.003*"batman"'), (5, '0.005*"fallen" + 0.005*"action" + 0.005*"really" + 0.004*"planet" + 0.004*"black" + 0.003*"makes" + 0.003*"pitch" + 0.003*"characters" + 0.003*"going" + 0.003*"people" + 0.003*"north" + 0.003*"second" + 0.003*"angel" + 0.003*"olympus" + 0.002*"something" + 
0.002*"horror" + 0.002*"order" + 0.002*"pilot" + 0.002*"quickly" + 0.002*"passengers"'), (6, '0.006*"story" + 0.005*"would" + 0.005*"riddick" + 0.005*"emotions" + 0.004*"house" + 0.004*"anderton" + 0.004*"young" + 0.003*"character" + 0.003*"around" + 0.003*"however" + 0.003*"raimi" + 0.003*"named" + 0.003*"called" + 0.003*"plays" + 0.002*"mystery" + 0.002*"including" + 0.002*"slowly" + 0.002*"strange" + 0.002*"second" + 0.002*"almost"'), (7, '0.004*"really" + 0.004*"something" + 0.004*"future" + 0.004*"character" + 0.003*"preston" + 0.003*"takes" + 0.003*"characters" + 0.003*"world" + 0.003*"equilibrium" + 0.003*"either" + 0.003*"daughter" + 0.003*"feels" + 0.003*"every" + 0.002*"spielberg" + 0.002*"pretty" + 0.002*"people" + 0.002*"first" + 0.002*"seems" + 0.002*"story" + 0.002*"funny"'), (8, '0.006*"school" + 0.005*"people" + 0.004*"would" + 0.004*"action" + 0.004*"girls" + 0.004*"american" + 0.004*"genre" + 0.004*"first" + 0.003*"really" + 0.003*"sense" + 0.003*"world" + 0.003*"romance" + 0.003*"almost" + 0.003*"human" + 0.003*"still" + 0.003*"watch" + 0.003*"could" + 0.003*"french" + 0.002*"better" + 0.002*"daniel"'), (9, '0.012*"president" + 0.012*"banning" + 0.010*"fallen" + 0.008*"angel" + 0.005*"butler" + 0.005*"action" + 0.005*"service" + 0.004*"secret" + 0.004*"played" + 0.004*"still" + 0.004*"trumbull" + 0.004*"freeman" + 0.003*"morgan" + 0.003*"could" + 0.003*"character" + 0.003*"director" + 0.003*"london" + 0.003*"gerard" + 0.003*"series" + 0.003*"feels"'), (10, '0.008*"batman" + 0.006*"nolan" + 0.005*"years" + 0.004*"school" + 0.003*"could" + 0.003*"planet" + 0.003*"anything" + 0.003*"audiences" + 0.003*"comedy" + 0.003*"never" + 0.003*"different" + 0.003*"better" + 0.003*"without" + 0.003*"director" + 0.003*"romance" + 0.003*"romantic" + 0.003*"reeves" + 0.003*"characters" + 0.002*"might" + 0.002*"story"'), (11, '0.006*"inception" + 0.006*"house" + 0.006*"would" + 0.005*"grace" + 0.005*"woman" + 0.004*"jason" + 0.004*"kidman" + 0.004*"nolan" + 
0.004*"horror" + 0.004*"every" + 0.004*"holmes" + 0.004*"could" + 0.003*"others" + 0.003*"years" + 0.003*"still" + 0.003*"seems" + 0.003*"children" + 0.003*"story" + 0.003*"really" + 0.003*"think"'), (12, '0.005*"really" + 0.005*"holmes" + 0.005*"watson" + 0.004*"christine" + 0.004*"grandpa" + 0.004*"ferrell" + 0.003*"years" + 0.003*"never" + 0.003*"brothers" + 0.003*"reilly" + 0.003*"makes" + 0.003*"dirty" + 0.003*"sherlock" + 0.003*"point" + 0.003*"director" + 0.003*"enough" + 0.003*"horror" + 0.003*"raimi" + 0.002*"trying" + 0.002*"world"'), (13, '0.005*"craig" + 0.004*"character" + 0.004*"would" + 0.004*"never" + 0.004*"batman" + 0.004*"daughter" + 0.003*"first" + 0.003*"james" + 0.003*"girls" + 0.003*"however" + 0.003*"robert" + 0.003*"grandpa" + 0.003*"going" + 0.003*"barbie" + 0.003*"perfect" + 0.002*"characters" + 0.002*"action" + 0.002*"things" + 0.002*"dirty" + 0.002*"everything"'), (14, '0.005*"story" + 0.005*"comedy" + 0.005*"years" + 0.005*"never" + 0.004*"first" + 0.004*"dream" + 0.004*"world" + 0.004*"girls" + 0.004*"would" + 0.003*"knows" + 0.003*"rather" + 0.003*"right" + 0.003*"instead" + 0.003*"since" + 0.003*"fresh" + 0.003*"craig" + 0.003*"daniel" + 0.003*"action" + 0.003*"characters" + 0.002*"funny"'), (15, '0.009*"holmes" + 0.006*"watson" + 0.006*"chapter" + 0.005*"loretta" + 0.004*"first" + 0.004*"horror" + 0.004*"raimi" + 0.004*"ferrell" + 0.004*"would" + 0.003*"reilly" + 0.003*"director" + 0.003*"pennywise" + 0.003*"years" + 0.003*"characters" + 0.003*"whose" + 0.003*"story" + 0.003*"together" + 0.003*"bullock" + 0.003*"treasure" + 0.003*"sherlock"'), (16, '0.014*"conjuring" + 0.008*"lorraine" + 0.005*"horror" + 0.005*"house" + 0.005*"warrens" + 0.005*"characters" + 0.005*"hodgson" + 0.005*"james" + 0.004*"janet" + 0.004*"farmiga" + 0.004*"warren" + 0.004*"frances" + 0.004*"peggy" + 0.004*"enfield" + 0.004*"children" + 0.004*"amityville" + 0.004*"wilson" + 0.004*"oconnor" + 0.004*"story" + 0.004*"first"'), (17, '0.007*"barbie" + 
0.005*"action" + 0.005*"great" + 0.004*"batman" + 0.004*"world" + 0.004*"characters" + 0.003*"french" + 0.003*"story" + 0.003*"would" + 0.003*"nothing" + 0.003*"first" + 0.003*"people" + 0.003*"still" + 0.003*"scifi" + 0.003*"never" + 0.003*"takes" + 0.003*"going" + 0.002*"little" + 0.002*"place" + 0.002*"scenes"')]
# Repeat the experiment at the low end of the sweep: a 2-topic LDA model,
# again rendering the document similarity matrix as a heat map.
number_of_topics = 2
words = 20

#model2,dictionary2,index2,doctermmatrix2=create_gensim_lda_model(processed_text,number_of_topics,words,titles)
model2, dictionary2, index2, doctermmatrix2 = create_gensim_lda_model(
    processed_text, number_of_topics, words)

# Run a similarity query for every document; vec_bow2 / vec2 / sims2 are
# rebound each pass and keep the final document's results afterwards.
for doc in processed_text:
    vec_bow2 = dictionary2.doc2bow(doc)   # bag-of-words representation
    vec2 = model2[vec_bow2]               # convert the query to embedded (topic) space
    sims2 = index2[vec2]                  # similarity query against the corpus
    #print(list(enumerate(sims2)))

# Heat-map of the pairwise similarity matrix, labelled by movie title.
fig, ax = plt.subplots(figsize=(30, 10))
cax = ax.matshow(index2, interpolation='nearest')
ax.grid(True)
plt.xticks(range(len(processed_text)), titles, rotation=90);
plt.yticks(range(len(processed_text)), titles);
fig.colorbar(cax)
plt.show()
WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.models.ldamodel:updated prior is not positive WARNING:gensim.similarities.docsim:scanning corpus to determine the number of features (consider setting `num_features` explicitly)
[(0, '0.003*"action" + 0.003*"story" + 0.003*"horror" + 0.003*"house" + 0.003*"first" + 0.003*"characters" + 0.002*"character" + 0.002*"school" + 0.002*"years" + 0.002*"still" + 0.002*"would" + 0.002*"people" + 0.002*"could" + 0.002*"really" + 0.002*"director" + 0.002*"something" + 0.002*"world" + 0.002*"fallen" + 0.002*"girls" + 0.002*"raimi"'), (1, '0.004*"would" + 0.003*"barbie" + 0.003*"holmes" + 0.003*"world" + 0.003*"first" + 0.002*"story" + 0.002*"watson" + 0.002*"comedy" + 0.002*"action" + 0.002*"years" + 0.002*"could" + 0.002*"character" + 0.002*"still" + 0.002*"never" + 0.002*"women" + 0.002*"actually" + 0.002*"characters" + 0.002*"played" + 0.002*"craig" + 0.002*"batman"')]